In [1]:
import numpy as np
import pandas as pd

## COVID - 19 Data From Jhon Hopkins University  

Data Link -> [GitHub](https://github.com/CSSEGISandData/COVID-19)

In [2]:
total_case_link = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
deaths_link = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
recovered_link = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
consise_link = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv"

## Defining Some Utility Function to Sort and Clean Data

In [3]:
def dropoff(dfs, data_left):
    for i in range(0, len(data_left)):
        ele = data_left[i]
        idx = dfs[dfs.country == ele].index
        dfs.drop(idx, axis = 0, inplace = True)
    
    return dfs

In [4]:
def order_data(dframes):
    dframe = pd.DataFrame(dframes)
    dframe.drop(['Lat', 'Long'], axis = 1, inplace = True)
    dframe.rename(columns={'Province/State':'province','Country/Region':'country'}, inplace = True)
    
    #----------------------------------------------------------
    country_list = np.sort(np.array(dframe.country.unique()))
    repeated = {}
    rep_country = []
    rep_first_dict = {}
    
    for i in range(0, len(country_list)):
        idx = np.array(dframe[dframe.country == country_list[i]].index)
        
        if len(idx) > 1:
            repeated[country_list[i]] = idx
            rep_country.append(country_list[i])
            rep_first_dict[idx[0]] = country_list[i]
        
        else:
            continue
    
    #---------------------------------------------------------
    to_drop = []
    to_change = []
    
    for i in range(0, len(rep_country)):
        nation = rep_country[i]
        index = repeated[nation]
        first_index = index[0]
        to_change.append(first_index)
        
        last_index = index[1:]
        for j in last_index:
            to_drop.append(j)
        
        dframe.iloc[first_index] = np.sum(dframe.iloc[index], axis = 0)
    
    
    #----------------------------------------------------------
    for i in to_change:
        dframe.loc[i, 'country'] = rep_first_dict[i]
    
    
    #----------------------------------------------------------
    dframe.drop(to_drop, inplace = True)
    
    
    #----------------------------------------------------------
    to_drop = ['Diamond Princess', 'Kosovo', 'MS Zaandam']

    new_frame = dropoff(dframe, to_drop)
    new_frame = dframe.drop(columns=['province'])
    new_frame.set_index('country', inplace = True)
    
    #--------------------
    return new_frame

In [5]:
def clean_data(dframe, dframe1, dframe2):
    cno = dframe.shape[0]
    dno = dframe.shape[1]
    entrys = cno*dno
    date = np.array(dframe.columns)
    country_list = np.array(dframe.index)
    
    nations = []
    dates = []
    values = []
    deaths = []
    recovery = []
    
    for i in range(0, cno):
        for j in range(0, dno):
            nations.append(country_list[i])
            dates.append(date[j])
            val = dframe.get_value(country_list[i], date[j])
            values.append(val)
    
    for i in range(0, cno):
        for j in range(0, dno):
            val1 = dframe1.get_value(country_list[i], date[j])
            recovery.append(val1)
    
    for i in range(0, cno):
        for j in range(0, dno):
            val2 = dframe2.get_value(country_list[i], date[j])
            deaths.append(val2)
    
    values = np.array(values).astype(np.int32)
    recovery = np.array(recovery).astype(np.int32)
    deaths = np.array(deaths).astype(np.int32)
    res = {'country':nations, 'date':dates, 'total_cases':values, 'recovered_cases':recovery, 'total_deaths':deaths}
    rec = pd.DataFrame(res)
    rec['date'] = pd.to_datetime(rec['date'])
    rec.fillna(0)
    
    return rec

In [6]:
def get_timeSeries_data(total_case_link, recovered_link, deaths_link):
    
    total = pd.read_csv(total_case_link)
    recovered = pd.read_csv(recovered_link)
    deaths = pd.read_csv(deaths_link)

    dframe = order_data(total)
    dframe1 = order_data(recovered)
    dframe2 = order_data(deaths)

    df = clean_data(dframe, dframe1, dframe2)

    to_drop = ['Diamond Princess', 'Kosovo', 'MS Zaandam']

    df_final = dropoff(df, to_drop)

    return df_final

In [7]:
def get_cumulativeCountry_data(consise_link):

    datac = pd.read_csv(consise_link)
    data = pd.DataFrame(datac)

    data = data.drop(['People_Tested', 'People_Hospitalized'], axis =1)
    data['Last_Update'] = pd.to_datetime(data['Last_Update']) 
    data = data.rename(columns = {'Country_Region':'country'})

    to_drop_from_data = ['Diamond Princess', 'Kosovo', 'MS Zaandam']

    cumulative = dropoff(data, to_drop_from_data)
    
    cumulative.set_index('country', inplace = True)

    return cumulative

## Downloading time series data into a DataFrame

In [8]:
df = get_timeSeries_data(total_case_link, recovered_link, deaths_link)



In [9]:
df

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths
0,Afghanistan,2020-01-22,0,0,0
1,Afghanistan,2020-01-23,0,0,0
2,Afghanistan,2020-01-24,0,0,0
3,Afghanistan,2020-01-25,0,0,0
4,Afghanistan,2020-01-26,0,0,0
...,...,...,...,...,...
31815,Lesotho,2020-07-07,91,11,0
31816,Lesotho,2020-07-08,91,11,0
31817,Lesotho,2020-07-09,134,20,1
31818,Lesotho,2020-07-10,184,26,1


## Downloading Cumulative COVID 19 country data into a DataFrame

In [10]:
cumulative = get_cumulativeCountry_data(consise_link)

In [11]:
cumulative

Unnamed: 0_level_0,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,Mortality_Rate,UID,ISO3
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Australia,2020-07-12 05:34:33,-25.000000,133.000000,9797.0,108.0,7727.0,1962.0,38.480422,1.102378,36,AUS
Austria,2020-07-12 05:34:33,47.516200,14.550100,18783.0,706.0,16864.0,1213.0,208.551697,3.758718,40,AUT
Canada,2020-07-12 05:34:33,60.001000,-95.001000,109150.0,8818.0,72784.0,27549.0,288.331729,8.078791,124,CAN
China,2020-07-12 05:34:33,30.592800,114.305500,85071.0,4641.0,79876.0,554.0,6.056271,5.455443,156,CHN
Denmark,2020-07-12 05:34:33,56.263900,9.501800,13147.0,609.0,12278.0,260.0,226.977542,4.632235,208,DNK
...,...,...,...,...,...,...,...,...,...,...,...
West Bank and Gaza,2020-07-12 05:34:33,31.952200,35.233200,5931.0,33.0,536.0,5362.0,116.261838,0.556399,275,PSE
Western Sahara,2020-07-12 05:34:33,24.215500,-12.885800,10.0,1.0,8.0,1.0,1.674116,10.000000,732,ESH
Yemen,2020-07-12 05:34:33,15.552727,48.516388,1389.0,365.0,642.0,382.0,4.657016,26.277898,887,YEM
Zambia,2020-07-12 05:34:33,-13.133897,27.849332,1895.0,42.0,1348.0,505.0,10.307901,2.216359,894,ZMB


## Importing Population Data

In [12]:
population = pd.read_csv('population_data.csv')
population.drop(['Unnamed: 0'], axis = 1, inplace = True)
population.set_index('country', inplace = True)
population

Unnamed: 0_level_0,population,median_age,urban_percentage_share,world_share
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
China,1438207241,38,61 %,18.47 %
India,1377233523,28,35 %,17.70 %
US,330610570,38,83 %,4.25 %
Indonesia,272931713,30,56 %,3.51 %
Pakistan,219992900,23,35 %,2.83 %
...,...,...,...,...
Saint Kitts and Nevis,53123,N.A.,33 %,0.00 %
Monaco,39186,N.A.,N.A.,0.00 %
Liechtenstein,38106,N.A.,15 %,0.00 %
San Marino,33917,N.A.,97 %,0.00 %


## Studying Data

In [13]:
for i in range(0, 185):
    print(population['urban_percentage_share'][i])

61 %
35 %
83 %
56 %
35 %
88 %
52 %
39 %
74 %
84 %
92 %
21 %
47 %
43 %
38 %
46 %
76 %
76 %
76 %
51 %
83 %
82 %
69 %
37 %
67 %
31 %
28 %
82 %
80 %
80 %
26 %
93 %
73 %
35 %
69 %
73 %
25 %
60 %
81 %
64 %
84 %
50 %
79 %
67 %
78 %
38 %
57 %
38 %
21 %
N.A.
39 %
56 %
51 %
86 %
17 %
79 %
18 %
31 %
44 %
55 %
18 %
85 %
58 %
45 %
52 %
63 %
60 %
92 %
49 %
24 %
23 %
47 %
38 %
39 %
18 %
48 %
14 %
70 %
69 %
98 %
57 %
78 %
25 %
85 %
74 %
85 %
91 %
66 %
56 %
88 %
57 %
86 %
72 %
27 %
79 %
57 %
13 %
56 %
93 %
74 %
43 %
43 %
36 %
62 %
76 %
78 %
78 %
57 %
36 %
73 %
N.A.
88 %
86 %
70 %
54 %
83 %
87 %
80 %
80 %
53 %
63 %
43 %
87 %
57 %
68 %
N.A.
58 %
43 %
58 %
63 %
96 %
52 %
67 %
63 %
55 %
96 %
63 %
71 %
55 %
59 %
73 %
87 %
31 %
59 %
55 %
45 %
69 %
89 %
73 %
52 %
68 %
33 %
41 %
67 %
30 %
79 %
59 %
29 %
27 %
46 %
68 %
88 %
87 %
65 %
68 %
35 %
93 %
80 %
46 %
86 %
94 %
31 %
74 %
19 %
35 %
53 %
56 %
26 %
88 %
74 %
33 %
N.A.
15 %
97 %
N.A.


In [14]:
for i in range(0, 185):
    print(population['median_age'][i], " years")

38  years
28  years
38  years
30  years
23  years
33  years
18  years
28  years
40  years
29  years
48  years
19  years
26  years
25  years
32  years
17  years
32  years
32  years
46  years
40  years
40  years
42  years
47  years
18  years
28  years
29  years
20  years
44  years
31  years
45  years
17  years
32  years
29  years
20  years
41  years
21  years
18  years
42  years
41  years
30  years
32  years
28  years
31  years
17  years
30  years
18  years
22  years
20  years
25  years
30  years
20  years
19  years
19  years
38  years
15  years
42  years
34  years
18  years
16  years
43  years
18  years
35  years
31  years
18  years
23  years
28  years
26  years
43  years
19  years
26  years
17  years
17  years
19  years
18  years
20  years
19  years
17  years
33  years
26  years
42  years
24  years
42  years
19  years
28  years
43  years
46  years
24  years
46  years
32  years
41  years
24  years
33  years
43  years
22  years
40  years
43  years
22  years
42  years
30  years
43  years


In [15]:
df.head()

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths
0,Afghanistan,2020-01-22,0,0,0
1,Afghanistan,2020-01-23,0,0,0
2,Afghanistan,2020-01-24,0,0,0
3,Afghanistan,2020-01-25,0,0,0
4,Afghanistan,2020-01-26,0,0,0


In [16]:
df['new_reported_cases'] = [0]*df.shape[0]
df['daily_recovered_cases'] = [0]*df.shape[0]
df['daily_deaths'] = [0]*df.shape[0]
df['active_cases'] = [0]*df.shape[0]
df['cases_growth_rate_in_percent'] = [0]*df.shape[0]

for i in range(1, df.shape[0]):
    df['new_reported_cases'][i] = df['total_cases'][i] -df['total_cases'][i-1] 
    df['daily_recovered_cases'][i] = df['recovered_cases'][i] -df['recovered_cases'][i-1]
    df['daily_deaths'][i] = df['total_deaths'][i] -df['total_deaths'][i-1]
    df['active_cases'][i] = df['total_cases'][i] -df['recovered_cases'][i] - df['total_deaths'][i]
    df['cases_growth_rate_in_percent'][i] = (df['new_reported_cases'][i]/df['total_cases'][i-1])*100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
A value is t

In [17]:
days = int(df.shape[0]/185)
indexs1 = []
for i in range(0,185):
    indexs1.append(i*days)


df.loc[indexs1 , 'new_reported_cases'] = 0
df.loc[indexs1 , 'daily_recovered_cases'] = 0
df.loc[indexs1 , 'daily_deaths'] = 0
df.loc[indexs1 , 'cases_growth_rate_in_percent'] = 0
df.head()

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent
0,Afghanistan,2020-01-22,0,0,0,0,0,0,0,0.0
1,Afghanistan,2020-01-23,0,0,0,0,0,0,0,
2,Afghanistan,2020-01-24,0,0,0,0,0,0,0,
3,Afghanistan,2020-01-25,0,0,0,0,0,0,0,
4,Afghanistan,2020-01-26,0,0,0,0,0,0,0,


In [18]:
df['mortality_rate'] = df['total_deaths']/df['total_cases']*100
df['recovery_rate'] = df['recovered_cases']/df['total_cases']*100
df = df.fillna(0)

In [19]:
df[df.country == 'India']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate
12040,India,2020-01-22,0,0,0,0,0,0,0,0.000000,0.000000,0.000000
12041,India,2020-01-23,0,0,0,0,0,0,0,0.000000,0.000000,0.000000
12042,India,2020-01-24,0,0,0,0,0,0,0,0.000000,0.000000,0.000000
12043,India,2020-01-25,0,0,0,0,0,0,0,0.000000,0.000000,0.000000
12044,India,2020-01-26,0,0,0,0,0,0,0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
12207,India,2020-07-07,742417,456831,20642,22753,16897,483,264944,3.161614,2.780378,61.532939
12208,India,2020-07-08,767296,476378,21129,24879,19547,487,269789,3.351082,2.753696,62.085297
12209,India,2020-07-09,793802,495513,21604,26506,19135,475,276685,3.454469,2.721585,62.422745
12210,India,2020-07-10,820916,515386,22123,27114,19873,519,283407,3.415713,2.694916,62.781819


In [20]:
#Calulating doubling time
to_remove = []
for i in range(0,185):
    for j in range(0,6):
        to_remove.append((i*days) + j)


r_avg = df['cases_growth_rate_in_percent'].rolling(7).mean()
dbl = np.log(2)/np.log(1 + (r_avg/100))
df['doubling_time_in_days'] = dbl
df.loc[to_remove, 'doubling_time_in_days'] = 0

In [21]:
df[df.country == 'India']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days
12040,India,2020-01-22,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
12041,India,2020-01-23,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
12042,India,2020-01-24,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
12043,India,2020-01-25,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
12044,India,2020-01-26,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12207,India,2020-07-07,742417,456831,20642,22753,16897,483,264944,3.161614,2.780378,61.532939,20.429991
12208,India,2020-07-08,767296,476378,21129,24879,19547,487,269789,3.351082,2.753696,62.085297,20.364878
12209,India,2020-07-09,793802,495513,21604,26506,19135,475,276685,3.454469,2.721585,62.422745,20.367046
12210,India,2020-07-10,820916,515386,22123,27114,19873,519,283407,3.415713,2.694916,62.781819,20.554273


In [22]:
#adding active cases per million and total cases per million
cases_per_million = []
active_cases_per_million = []
population_share = []

for i in range(0, df.shape[0]):
    cntry = df['country'][i]
    cases = df['total_cases'][i]
    active = df['active_cases'][i]
    population_size = population['population'][cntry]
    share = population['world_share'][cntry]
    
    entry_cases = cases/population_size*1000000
    cases_per_million.append(entry_cases)
    
    entry_active = active/population_size*1000000
    active_cases_per_million.append(entry_active)
    
    population_share.append(share)

df['total_cases_per_million'] = cases_per_million
df['active_cases_per_million'] = active_cases_per_million
df['world_population_share'] = population_share

In [23]:
df[df.country == 'India']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share
12040,India,2020-01-22,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %
12041,India,2020-01-23,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %
12042,India,2020-01-24,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %
12043,India,2020-01-25,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %
12044,India,2020-01-26,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12207,India,2020-07-07,742417,456831,20642,22753,16897,483,264944,3.161614,2.780378,61.532939,20.429991,539.063991,192.374057,17.70 %
12208,India,2020-07-08,767296,476378,21129,24879,19547,487,269789,3.351082,2.753696,62.085297,20.364878,557.128466,195.891979,17.70 %
12209,India,2020-07-09,793802,495513,21604,26506,19135,475,276685,3.454469,2.721585,62.422745,20.367046,576.374294,200.899118,17.70 %
12210,India,2020-07-10,820916,515386,22123,27114,19873,519,283407,3.415713,2.694916,62.781819,20.554273,596.061587,205.779917,17.70 %


In [24]:
df[df.country == 'US']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share
26660,US,2020-01-22,1,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.003025,0.003025,4.25 %
26661,US,2020-01-23,1,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.003025,0.003025,4.25 %
26662,US,2020-01-24,2,0,0,1,0,0,2,100.000000,0.000000,0.000000,0.000000,0.006049,0.006049,4.25 %
26663,US,2020-01-25,2,0,0,0,0,0,2,0.000000,0.000000,0.000000,0.000000,0.006049,0.006049,4.25 %
26664,US,2020-01-26,5,0,0,3,0,0,5,150.000000,0.000000,0.000000,0.000000,0.015124,0.015124,4.25 %
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26827,US,2020-07-07,2996098,936476,131480,60021,12328,1195,1928142,2.044258,4.388374,31.256521,37.935536,9062.317639,5832.063990,4.25 %
26828,US,2020-07-08,3054699,953462,132300,58601,16986,820,1968937,1.955911,4.331032,31.212961,37.892295,9239.568475,5955.456899,4.25 %
26829,US,2020-07-09,3117946,969111,133290,63247,15649,990,2015545,2.070482,4.274930,31.081712,37.764615,9430.872098,6096.432428,4.25 %
26830,US,2020-07-10,3184573,983185,134092,66627,14074,802,2067296,2.136888,4.210674,30.873370,37.216798,9632.399230,6252.964024,4.25 %


In [25]:
num_country = len(df.country.unique())
num_dates = len(df.date.unique())

country_list = np.array(df.country.unique())
date_list = df.date.unique()

total_case_share = []
active_case_share = []
death_share = []
recovered_case_share = []


total_case_share_by_date = []
active_case_share_by_date = []
death_share_by_date = []
recovered_case_share_by_date = []

for i  in date_list:
    
    total_day_case = df[df.date == i].sum()['total_cases']
    active_day_case = df[df.date == i].sum()['active_cases']
    death_day_case = df[df.date == i].sum()['total_deaths']
    recovered_day_case = df[df.date == i].sum()['recovered_cases']
    
    total_case_share_by_date.append(total_day_case)
    active_case_share_by_date.append(active_day_case)
    death_share_by_date.append(death_day_case)
    recovered_case_share_by_date.append(recovered_day_case)
    
for j in country_list:
    idx = df[df.country == j].index
    
    tot = np.array(df['total_cases'][idx])/total_case_share_by_date*100
    act = np.array(df['active_cases'][idx])/active_case_share_by_date*100
    dea = np.array(df['total_deaths'][idx])/death_share_by_date*100
    reco = np.array(df['recovered_cases'][idx])/recovered_case_share_by_date*100
    
    total_case_share = np.concatenate([total_case_share,tot])
    active_case_share = np.concatenate([active_case_share, act])
    death_share = np.concatenate([death_share, dea])
    recovered_case_share = np.concatenate([recovered_case_share,reco])

df['world_total_cases_share'] = total_case_share
df['world_active_cases_share'] = active_case_share
df['world_deaths_share'] = death_share
df['world_recovered_cases_share'] = recovered_case_share
    

In [26]:
df[df.country == 'India']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share,world_total_cases_share,world_active_cases_share,world_deaths_share,world_recovered_cases_share
12040,India,2020-01-22,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %,0.000000,0.000000,0.000000,0.000000
12041,India,2020-01-23,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %,0.000000,0.000000,0.000000,0.000000
12042,India,2020-01-24,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %,0.000000,0.000000,0.000000,0.000000
12043,India,2020-01-25,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %,0.000000,0.000000,0.000000,0.000000
12044,India,2020-01-26,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.70 %,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12207,India,2020-07-07,742417,456831,20642,22753,16897,483,264944,3.161614,2.780378,61.532939,20.429991,539.063991,192.374057,17.70 %,6.278273,5.478521,3.794004,7.088081
12208,India,2020-07-08,767296,476378,21129,24879,19547,487,269789,3.351082,2.753696,62.085297,20.364878,557.128466,195.891979,17.70 %,6.374546,5.523314,3.846035,7.214620
12209,India,2020-07-09,793802,495513,21604,26506,19135,475,276685,3.454469,2.721585,62.422745,20.367046,576.374294,200.899118,17.70 %,6.472779,5.565463,3.893854,7.354651
12210,India,2020-07-10,820916,515386,22123,27114,19873,519,283407,3.415713,2.694916,62.781819,20.554273,596.061587,205.779917,17.70 %,6.570777,5.604735,3.949830,7.494589


In [27]:
df[df.country == 'US']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share,world_total_cases_share,world_active_cases_share,world_deaths_share,world_recovered_cases_share
26660,US,2020-01-22,1,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.003025,0.003025,4.25 %,0.180180,0.196078,0.000000,0.000000
26661,US,2020-01-23,1,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.003025,0.003025,4.25 %,0.152905,0.165017,0.000000,0.000000
26662,US,2020-01-24,2,0,0,1,0,0,2,100.000000,0.000000,0.000000,0.000000,0.006049,0.006049,4.25 %,0.212540,0.227531,0.000000,0.000000
26663,US,2020-01-25,2,0,0,0,0,0,2,0.000000,0.000000,0.000000,0.000000,0.006049,0.006049,4.25 %,0.139470,0.147820,0.000000,0.000000
26664,US,2020-01-26,5,0,0,3,0,0,5,150.000000,0.000000,0.000000,0.000000,0.015124,0.015124,4.25 %,0.236072,0.248756,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26827,US,2020-07-07,2996098,936476,131480,60021,12328,1195,1928142,2.044258,4.388374,31.256521,37.935536,9062.317639,5832.063990,4.25 %,25.336600,39.870183,24.166052,14.530139
26828,US,2020-07-08,3054699,953462,132300,58601,16986,820,1968937,1.955911,4.331032,31.212961,37.892295,9239.568475,5955.456899,4.25 %,25.377845,40.309494,24.082087,14.439933
26829,US,2020-07-09,3117946,969111,133290,63247,15649,990,2015545,2.070482,4.274930,31.081712,37.764615,9430.872098,6096.432428,4.25 %,25.424193,40.542283,24.023878,14.384029
26830,US,2020-07-10,3184573,983185,134092,66627,14074,802,2067296,2.136888,4.210674,30.873370,37.216798,9632.399230,6252.964024,4.25 %,25.489963,40.883414,23.940725,14.297181


In [28]:
df[df.country == 'Russia']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share,world_total_cases_share,world_active_cases_share,world_deaths_share,world_recovered_cases_share
21156,Russia,2020-01-22,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.87 %,0.000000,0.000000,0.000000,0.000000
21157,Russia,2020-01-23,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.87 %,0.000000,0.000000,0.000000,0.000000
21158,Russia,2020-01-24,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.87 %,0.000000,0.000000,0.000000,0.000000
21159,Russia,2020-01-25,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.87 %,0.000000,0.000000,0.000000,0.000000
21160,Russia,2020-01-26,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.87 %,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21323,Russia,2020-07-07,693215,463103,10478,6363,9533,198,219634,0.926400,1.511508,66.805104,70.213425,4750.585604,1505.146482,1.87 %,5.862195,4.541599,1.925859,7.185396
21324,Russia,2020-07-08,699749,471718,10650,6534,8615,172,217381,0.942565,1.521974,67.412458,70.923935,4795.362948,1489.706728,1.87 %,5.813379,4.450380,1.938581,7.144046
21325,Russia,2020-07-09,706240,480494,10826,6491,8776,176,214920,0.927618,1.532907,68.035512,72.025197,4839.845613,1472.841554,1.87 %,5.758785,4.323073,1.951253,7.131732
21326,Russia,2020-07-10,712863,488234,11000,6623,7740,174,213629,0.937783,1.543073,68.489177,72.866509,4885.232872,1463.994362,1.87 %,5.705899,4.224786,1.963935,7.099752


In [29]:
df[df.country == 'Brazil']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share,world_total_cases_share,world_active_cases_share,world_deaths_share,world_recovered_cases_share
3612,Brazil,2020-01-22,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.73 %,0.000000,0.000000,0.000000,0.000000
3613,Brazil,2020-01-23,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.73 %,0.000000,0.000000,0.000000,0.000000
3614,Brazil,2020-01-24,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.73 %,0.000000,0.000000,0.000000,0.000000
3615,Brazil,2020-01-25,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.73 %,0.000000,0.000000,0.000000,0.000000
3616,Brazil,2020-01-26,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.73 %,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3779,Brazil,2020-07-07,1668589,1107012,66741,45305,44470,1254,494836,2.790947,3.999847,66.344199,27.848304,7861.315604,2331.348204,2.73 %,14.110477,10.232235,12.267010,17.176134
3780,Brazil,2020-07-08,1713160,1139844,67964,44571,32832,1223,505352,2.671179,3.967172,66.534591,28.918177,8071.305420,2380.892816,2.73 %,14.232600,10.345930,12.371239,17.262640
3781,Brazil,2020-07-09,1755779,1171447,69184,42619,31603,1220,515148,2.487742,3.940359,66.719502,30.391616,8272.098671,2427.045252,2.73 %,14.316882,10.362099,12.469562,17.387201
3782,Brazil,2020-07-10,1800827,1217361,70398,45048,45914,1214,513068,2.565699,3.909204,67.600108,30.873857,8484.335804,2417.245633,2.73 %,14.414182,10.146574,12.568827,17.702499


In [30]:
df[df.country == 'China']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share,world_total_cases_share,world_active_cases_share,world_deaths_share,world_recovered_cases_share
5504,China,2020-01-22,548,28,17,0,0,0,503,0.000000,3.102190,5.109489,0.000000,0.381030,0.349741,18.47 %,98.738739,98.627451,100.000000,100.000000
5505,China,2020-01-23,643,30,18,95,2,1,595,17.335766,2.799378,4.665630,0.000000,0.447084,0.413710,18.47 %,98.318043,98.184818,100.000000,100.000000
5506,China,2020-01-24,920,36,26,277,6,8,858,43.079316,2.826087,3.913043,0.000000,0.639685,0.596576,18.47 %,97.768332,97.610922,100.000000,100.000000
5507,China,2020-01-25,1406,39,42,486,3,16,1325,52.826087,2.987198,2.773826,0.000000,0.977606,0.921286,18.47 %,98.047420,97.930525,100.000000,100.000000
5508,China,2020-01-26,2075,49,56,669,10,14,1970,47.581792,2.698795,2.361446,0.000000,1.442768,1.369761,18.47 %,97.969783,98.009950,100.000000,94.230769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5671,China,2020-07-07,84917,79754,4641,28,29,0,522,0.032984,5.465337,93.919945,3118.880618,59.043647,0.362952,18.47 %,0.718103,0.010794,0.853017,1.237444
5672,China,2020-07-08,84950,79802,4641,33,48,0,507,0.038861,5.463214,93.939965,3073.482756,59.066592,0.352522,18.47 %,0.705748,0.010380,0.844784,1.208580
5673,China,2020-07-09,84992,79802,4641,42,0,0,549,0.049441,5.460514,93.893543,2543.075134,59.095795,0.381725,18.47 %,0.693037,0.011043,0.836483,1.184461
5674,China,2020-07-10,84992,79802,4641,0,0,0,549,0.000000,5.460514,93.893543,2675.274417,59.095795,0.381725,18.47 %,0.680293,0.010857,0.828602,1.160457


In [31]:
##Just Some Testing
arn = pd.DataFrame(df[df.country == 'United Kingdom'])
arn_sm = arn['recovered_cases'].cumsum()
arn_sm

26316         0
26317         0
26318         0
26319         0
26320         0
          ...  
26483     98149
26484     99527
26485    100905
26486    102283
26487    103661
Name: recovered_cases, Length: 172, dtype: int32

In [32]:
df[df.country == 'Spain']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share,world_total_cases_share,world_active_cases_share,world_deaths_share,world_recovered_cases_share
23564,Spain,2020-01-22,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.60 %,0.000000,0.000000,0.000000,0.000000
23565,Spain,2020-01-23,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.60 %,0.000000,0.000000,0.000000,0.000000
23566,Spain,2020-01-24,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.60 %,0.000000,0.000000,0.000000,0.000000
23567,Spain,2020-01-25,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.60 %,0.000000,0.000000,0.000000,0.000000
23568,Spain,2020-01-26,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.60 %,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23731,Spain,2020-07-07,252130,150376,28392,341,0,4,73362,0.135431,11.260857,59.642248,425.153762,5393.019534,1569.201202,0.60 %,2.132145,1.516982,5.218456,2.333198
23732,Spain,2020-07-08,252513,150376,28396,383,0,4,73741,0.151906,11.245362,59.551785,426.552389,5401.211841,1577.307950,0.60 %,2.097829,1.509679,5.168820,2.277405
23733,Spain,2020-07-09,253056,150376,28401,543,0,5,74279,0.215038,11.223208,59.424001,413.067505,5412.826523,1588.815682,0.60 %,2.063456,1.494107,5.118930,2.231956
23734,Spain,2020-07-10,253908,150376,28403,852,0,2,75129,0.336684,11.186335,59.224601,363.637401,5431.050663,1606.997043,0.60 %,2.032331,1.485772,5.071059,2.186723


In [33]:
df[df.country == 'Italy']

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share,world_total_cases_share,world_active_cases_share,world_deaths_share,world_recovered_cases_share
13072,Italy,2020-01-22,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.78 %,0.000000,0.000000,0.000000,0.000000
13073,Italy,2020-01-23,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.78 %,0.000000,0.000000,0.000000,0.000000
13074,Italy,2020-01-24,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.78 %,0.000000,0.000000,0.000000,0.000000
13075,Italy,2020-01-25,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.78 %,0.000000,0.000000,0.000000,0.000000
13076,Italy,2020-01-26,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.78 %,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13239,Italy,2020-07-07,241956,192815,34899,137,574,30,14242,0.056654,14.423697,79.690109,849.507109,4000.633339,235.485047,0.78 %,2.046109,0.294497,6.414444,2.991672
13240,Italy,2020-07-08,242149,193640,34914,193,825,15,13595,0.079767,14.418395,79.967293,843.434791,4003.824507,224.787194,0.78 %,2.011727,0.278327,6.355268,2.932627
13241,Italy,2020-07-09,242363,193978,34926,214,338,12,13459,0.088375,14.410615,80.036144,836.331961,4007.362901,222.538495,0.78 %,1.976264,0.270725,6.294981,2.879118
13242,Italy,2020-07-10,242639,194273,34938,276,295,12,13428,0.113879,14.399169,80.066683,806.694369,4011.926436,222.025924,0.78 %,1.942131,0.265556,6.237815,2.825060


In [34]:
# Latest Day Data
ts = df['date'][df.shape[0]-1]
last_day_df = pd.DataFrame(df[df.date == ts])

In [35]:
last_day_df

Unnamed: 0,country,date,total_cases,recovered_cases,total_deaths,new_reported_cases,daily_recovered_cases,daily_deaths,active_cases,cases_growth_rate_in_percent,mortality_rate,recovery_rate,doubling_time_in_days,total_cases_per_million,active_cases_per_million,world_population_share,world_total_cases_share,world_active_cases_share,world_deaths_share,world_recovered_cases_share
171,Afghanistan,2020-07-11,34366,21135,994,172,253,23,12237,0.503012,2.892394,61.499738,95.971872,887.026791,315.851331,0.50 %,0.270333,0.237841,0.175922,0.301824
343,Albania,2020-07-11,3371,1881,89,93,6,4,1401,2.837096,2.640166,55.799466,27.130557,1171.128605,486.725356,0.04 %,0.026517,0.027230,0.015752,0.026862
515,Algeria,2020-07-11,18712,13124,1004,470,0,8,4584,2.576472,5.365541,70.136811,25.762907,428.333187,104.931559,0.56 %,0.147194,0.089096,0.177692,0.187421
687,Andorra,2020-07-11,855,803,52,0,0,0,0,0.000000,6.081871,93.918129,inf,11069.394096,0.000000,0.00 %,0.006726,0.000000,0.009203,0.011467
859,Angola,2020-07-11,462,118,23,4,1,0,321,0.873362,4.978355,25.541126,16.158390,14.152338,9.833118,0.42 %,0.003634,0.006239,0.004071,0.001685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31131,Sao Tome and Principe,2020-07-11,727,284,14,0,0,0,429,0.000000,1.925722,39.064649,438.238019,3330.157392,1965.113509,0.00 %,0.005719,0.008338,0.002478,0.004056
31303,Yemen,2020-07-11,1389,642,365,9,12,1,382,0.652174,26.277898,46.220302,45.267821,46.787819,12.867492,0.38 %,0.010926,0.007425,0.064599,0.009168
31475,Comoros,2020-07-11,317,296,7,3,24,0,14,0.955414,2.208202,93.375394,189.491723,366.179352,16.171959,0.01 %,0.002494,0.000272,0.001239,0.004227
31647,Tajikistan,2020-07-11,6506,5176,55,49,61,0,1275,0.758866,0.845374,79.557332,88.522346,685.394605,134.318801,0.12 %,0.051178,0.024781,0.009734,0.073917


In [36]:
df.to_csv(r'final_data/covid_time_series.csv')

In [37]:
cumulative.to_csv(r'final_data/covid_cumulative.csv')

In [38]:
population.to_csv(r'final_data/population_data.csv')