In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook investigates whether there is a relation between the COVID-19 deaths in a country and features 
that reflect the country's public-health policies, such as immunization, health expenditures (both government and private), malaria, tuberculosis, smoking, poverty, number of undernourished people, and GDP.
It also considers population density and the percentage of people living in urban areas.  
  
According to Our World in Data:
https://ourworldindata.org/coronavirus#deaths-due-to-covid-19

Quote: 

    1. the actual total death toll from COVID-19 is likely to be higher than the number of confirmed deaths – this is due to limited testing and problems in the attribution of the cause of death; the difference between reported confirmed deaths and total deaths varies by country  
    2. how COVID-19 deaths are recorded may differ between countries (e.g. some countries may only count hospital deaths, whilst others have started to include deaths in homes)  
    3. the reported death figures on a given date does not necessarily show the number of new deaths on that day: this is due to delays in reporting.

# 1. load data from CORONAVIRUS GOVERNMENT RESPONSE TRACKER from University of Oxford

In [None]:
#Link to data of CORONAVIRUS GOVERNMENT RESPONSE TRACKER from University of Oxford
link = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'

# NOTE(review): the columns are selected by position, so a change in the upstream
# CSV layout silently selects different columns -- confirm against the file header.
# Text columns (used below as 'CountryName' and 'RegionName') are read as strings.
df_data1 = pd.read_csv(link, usecols=[0,2], keep_default_na=True, dtype='str')

# Numeric columns: the date, the restriction indicators and 'ConfirmedCases'
# (presumably C1..C8 -- TODO confirm against the header).
df_data2 = pd.read_csv(link, usecols=[5,6,8,10,12,14,16,18,20,35], keep_default_na=True)

print(df_data2.isna().sum())

# Missing values are replaced by 0 so the columns can be stored as integers.
df_data2 = df_data2.fillna(0)
print('df_data2.isna().sum()', df_data2.isna().sum())

df_data2 = df_data2.astype('int32')

print('df_data2.describe()',df_data2.describe())

df_data = pd.concat([df_data1, df_data2], axis=1)
print('df_data.shape',df_data.shape)

# Parse the YYYYMMDD integers into real timestamps. A parse failure now raises
# instead of silently leaving the raw integers in place (errors='ignore' is
# deprecated in pandas). pop() + assignment moves 'Date' to the last column,
# which the positional column selections further down rely on.
df_data['Date'] = pd.to_datetime(df_data.pop('Date'), format='%Y%m%d')

# Plot the selected indicator columns over time for a few example countries.
# NOTE(review): the y columns are chosen by position in df_data -- confirm they
# still point at the intended restriction indicators.
example_countries = ['France', 'Belgium', 'Sweden', 'United States']
plotted_column_positions = [1,3,4,5,6,7,8,9]

country_frames = {}
for example_country in example_countries:
    country_frames[example_country] = df_data[df_data['CountryName'] == example_country]
    country_frames[example_country].plot(x='Date', y=plotted_column_positions, figsize=(20,5))
    plt.title(example_country)
    plt.show()

# Keep the per-country names the original cell exposed (the France frame used
# to be stored under the Belgium name before being overwritten).
df_data_Belgium = country_frames['Belgium']
df_data_Sweden = country_frames['Sweden']
df_data_US = country_frames['United States']

In [None]:
# collecting the data as one feature in a new df_data_control with country as index and 'control_score' and 'first_action_delay'
# cumulate the Closing and Controls (C1..C8) per country
# for countries with Regions cumulate and divide by total number of Regions per Country
# access the number of days between the first case and the start of the first restriction C2/C4/C5/C6 any come first 
def count_restrictions(df):
    '''
    Summarise the government restrictions of one country (or one region).

    input:
    df: all rows of one country/region in chronological order. Every column
        except the first two and the last two is treated as a restriction
        indicator; the columns 'ConfirmedCases' and 'Date' must be present.

    output: tuple (first_action_delay, control_score)
    first_action_delay: number of days between the first row with
        ConfirmedCases > 0 and the earliest date on which any restriction
        reached level 2 or higher (negative when the restriction came first).
        365.0 when there is no confirmed case or no restriction ever reached
        level 2.
    control_score: sum of all restriction values over all rows.
    '''
    no_action_delay = 365.0

    restrictions = df.columns.values[2:-2]
    control_score = 0
    for C in restrictions:
        control_score += df[C].sum()

    rows_with_cases = df[df['ConfirmedCases'] > 0]
    if len(rows_with_cases) == 0:
        return (no_action_delay, control_score)
    date_of_first_case = rows_with_cases['Date'].values[0]

    times = []
    for C in restrictions:
        rows_restricted = df[df[C] >= 2]
        if len(rows_restricted) == 0:
            # This restriction never reached level 2: skip it, but keep
            # checking the others (a `break` here discarded all later ones).
            continue
        diff = rows_restricted['Date'].values[0] - date_of_first_case
        times.append(diff / np.timedelta64(1, 'D'))

    if len(times) == 0:
        return (no_action_delay, control_score)
    return (min(times), control_score)
    

# collecting the data as one feature in a new df_data_control with country as index and 'control_score' and 'first_action_delay'
# cumulate the Closing and Controls (C1..C8) per country
# for countries with Regions cumulate and divide by total number of Regions per Country
# access the number of days between the first case and the start of the first restriction C2/C4/C5/C6 any come first 

country_as_index = df_data.CountryName.unique()
df_data_control = pd.DataFrame(index=country_as_index, columns=['first_action_delay', 'control_score'])

for ci in country_as_index:
    df_country = df_data[df_data.CountryName == ci ].copy()
    regions = df_country.RegionName.unique()
    if len(regions) == 1:
        first_action_delay, control_score = count_restrictions(df_country)
        # a restriction that started before the first case is stored as 365
        if first_action_delay < 0:
            first_action_delay = 365
    else:
        print(ci)
        print(len(regions))
        first_action_delays = []
        control_scores = []
        for region in regions:
            # The national rows have a missing RegionName; `== region` never
            # matches NaN, so they need an explicit isna() mask. Filtering
            # df_country (not df_data) keeps regions of other countries out.
            if pd.isna(region):
                region_rows = df_country.RegionName.isna()
            else:
                region_rows = df_country.RegionName == region
            df_region = df_country[region_rows].copy()
            first_action_delay, control_score = count_restrictions(df_region)
            first_action_delays.append(first_action_delay)
            control_scores.append(control_score)
        # average over all regions of the country
        control_score = np.mean(control_scores)
        first_action_delay = np.mean(first_action_delays)

    df_data_control.at[ci, 'first_action_delay'] = first_action_delay
    df_data_control.at[ci, 'control_score'] = control_score

df_data_control.head()

In [None]:
df_data_control = df_data_control.astype('float32')
df_data_control.info()

In [None]:
print(df_data_control[df_data_control.index == 'United States'])
print()
print(df_data_control[df_data_control.index == 'Belgium'])

# load Covid-19 data 

In [None]:

# Johns Hopkins CSSE time series of cumulative deaths
# (local copy: './novel-corona-virus-2019-dataset/time_series_covid19_deaths_global.csv')
link_deaths_raw = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'

df_corona_2 = pd.read_csv(link_deaths_raw)

# keep the country column (position 1) and the most recent date (last column),
# then add up all provinces of a country
temp = df_corona_2.iloc[:, [1, -1]].copy()
df_corona = (
    temp.groupby(temp['Country/Region'], as_index=True)
    .sum()
    .reset_index()
    .rename(columns={'Country/Region': 'Country'})
)
df_corona.head(3)

# Load the data from World Bank

In [None]:
# load all_health data from World Bank
file = '/kaggle/input/covid19correlationswithdatafromworldbank/World_Bank_Data_all_health.csv'
df_health = pd.read_csv(file)
df_health.head(3)

In [None]:
# the total number of different data items are:

print('Total number of different data items:' ,len(df_health['Series Code'].unique()))
print('different data items:', df_health['Series Name'].unique())

# 2. Load the data from World Bank

In [None]:
# load all_health data from World Bank
file = '/kaggle/input/covid19correlationswithdatafromworldbank/World_Bank_Data_all_health.csv'
df_health = pd.read_csv(file)
df_health.head(3)

In [None]:
# the total number of different data items are:

print('Total number of different data items:' ,len(df_health['Series Code'].unique()))
print('different data items:', df_health['Series Name'].unique())

# The country names in both DataFrames are not the same
Make a dict with the names to be changed as keys and the replacement names as values.

In [None]:
# dict with key = country name in World Bank total and value = country name in  data_corona

dict_country_names_data_corona = {
    'Bahamas, The' : 'Bahamas',
    'Brunei Darussalam' : 'Brunei',
    'Congo, Rep.' : 'Congo (Brazzaville)',
    'Congo, Dem. Rep.'   : 'Congo (Kinshasa)',
    'Czech Republic' : 'Czechia',
    'Egypt, Arab Rep.' : 'Egypt',
    'Gambia, The' : 'Gambia',
    'Iran, Islamic Rep.' : 'Iran',
    'Korea, Rep.' : 'Korea, South',
    'Kyrgyz Republic' : 'Kyrgyzstan',
    'Lao PDR' : 'Laos',
    'Russian Federation' : 'Russia',
    'Slovak Republic' : 'Slovakia',
    'Syrian Arab Republic' : 'Syria',    
    'United States' : 'US',
    'Venezuela, RB' : 'Venezuela',
    'Yemen, Rep.' : 'Yemen'
        }

In [None]:
def change_name(df, country, dictionary):
    '''
    Changes the country names of df following the dictionary.

    input:
    df:  DataFrame where country names have to be changed (modified in place)
    country: the column with the country names to be changed
    dictionary: the dict with old names as keys and new names as values

    output:
    returns df with the column 'Country' holding the corrected names
    returns a list with the old names that were changed (one entry per row)
    '''
    original_names = df[country]
    needs_change = original_names.isin(list(dictionary))
    list_of_changed_countries = original_names[needs_change].tolist()

    # Assign the whole column in one step: the former element-wise
    # `df.Country.iat[i] = ...` is a chained assignment, which pandas does not
    # guarantee to write back into df.
    df['Country'] = original_names.map(lambda name: dictionary.get(name, name))

    return df, list_of_changed_countries

In [None]:
# changing the country names
df_health , the_old_names = change_name(df_health, 'Country', dict_country_names_data_corona)
#print(the_old_names)
print(len(dict_country_names_data_corona), len(the_old_names))

# Merge health data with covid-19 death cases

In [None]:
def add_corona_deaths(df_1, df_totals):
    '''
    Fills the 'deaths' column of df_totals with the death count of each row's country.

    input:
    df_1: deaths per country (df_corona); needs a 'Country' column, and its
          second column holds the death count of the latest date
    df_totals: health data in long format with the columns 'Country',
               'Series Code' and an already existing 'deaths' column
               (modified in place)

    output:
    df_totals, where 'deaths' is set for every row whose country occurs in
    df_1; all other rows keep their previous 'deaths' value
    '''
    last_date = df_1.columns[1]
    deaths_by_country = (
        df_1.dropna(subset=['Country'])
        .drop_duplicates('Country')          # first entry wins, as before
        .set_index('Country')[last_date]
    )

    deaths = df_totals['Country'].map(deaths_by_country)
    # rows without a Series Code were never updated by the original loop either
    found = deaths.notna() & df_totals['Series Code'].notna()

    # whole-column assignment: works for any index and avoids chained assignment
    df_totals['deaths'] = df_totals['deaths'].where(~found, deaths)
    return df_totals

In [None]:
df_totals = df_health.copy()
df_totals['deaths'] = -1.0
df_totals = add_corona_deaths(df_corona, df_totals)

In [None]:
df_totals.sort_values(['Series Code', 'Country'], axis='index', inplace=True)
df_totals = df_totals[df_totals['deaths'] >= 0. ].copy()
df_totals.reset_index(inplace=True)
del df_totals['index']

In [None]:
indexes_tot_drop = df_totals[df_totals.Value.isna() ].index.ravel()
print(df_totals.shape)
df_totals = df_totals.drop(indexes_tot_drop, axis='index')
print(df_totals.shape)

In [None]:
df_totals.sort_values(['Series Code', 'Country'], axis='index', inplace=True)
df_totals.reset_index(inplace=True)
del df_totals['index']
df_totals

In [None]:
# what are the series code

all_Series_Codes = df_totals['Series Code'].unique()
print(len(all_Series_Codes))

In [None]:
# find all indexes that belong to the 'Series Code' per code
dict_S_Code_indexes = {}
for code in all_Series_Codes:
    df_temp = df_totals[df_totals['Series Code'] == code]
    dict_S_Code_indexes[code] = df_temp.index.ravel()
    
    

In [None]:
for key, value in dict_S_Code_indexes.items():
    print('Series Code:', key, 'number of countries:', len(value))

In [None]:
for country in df_totals.Country.unique():
    dummy = (df_totals[df_totals.Country == country]['Series Code']=='SP.POP.TOTL')
    index = dummy[dummy].index.ravel()
    if len(index) > 0:
        index= index[0]
        print(index, ';',country, 'population:', df_totals[index:index+1].Value.ravel()[0])
    else:
        print(country, df_totals[df_totals.Country== country])

In [None]:
# add a column 'deaths_pp' : number of deaths per 100.000 inhabitants
def ad_deaths_per_pop(df, population_overrides=None):
    '''
    Fills the column 'deaths_pp': number of deaths per 100.000 inhabitants.

    input:
    df: long-format DataFrame with the columns 'Country', 'Series Code',
        'Value' and 'deaths' (modified in place). The population of a country
        is the 'Value' of its first row with Series Code 'SP.POP.TOTL'.
    population_overrides: optional dict {country: population} used instead of
        the value in df; defaults to the population of Eritrea, which is not
        in the df

    output:
    df, with 'deaths_pp' set on every row of each country whose population is
    known. Rows of countries without a population are left untouched.
    '''
    if population_overrides is None:
        population_overrides = {'Eritrea': 6081196.}

    population_rows = df[df['Series Code'] == 'SP.POP.TOTL'].drop_duplicates('Country')
    population_by_country = dict(zip(population_rows['Country'], population_rows['Value']))
    population_by_country.update(population_overrides)

    for country in df['Country'].unique():
        if country not in population_by_country:
            # Unknown population: do not reuse the value of the previous country.
            continue
        rows_of_country = df['Country'] == country
        deaths = df.loc[rows_of_country, 'deaths'].values[0]
        df.loc[rows_of_country, 'deaths_pp'] = deaths * 100000 / population_by_country[country]

    return df

In [None]:
# # add a column 'deaths_pp' : number of deaths per 100.000 inhabitants
df_totals['deaths_pp'] = -100.
df_totals = ad_deaths_per_pop(df_totals)
df_totals.head(20)

In [None]:
# normalize unnormalized series : express them as value * 100 / total population
# Smoking prevalence, total, ages 15+ : SH.PRV.SMOK
# Gross domestic product 2019 (millions of US dollars)   :   GDP_USdollars
# Population ages 65 and above, total     : SP.POP.65UP.TO
# NOTE: this cell is not idempotent -- running it twice normalizes twice.
series_to_normalize = ['GDP_USdollars', 'SP.POP.65UP.TO']

# total population per country (first 'SP.POP.TOTL' row of each country)
population_by_country = (
    df_totals[df_totals['Series Code'] == 'SP.POP.TOTL']
    .drop_duplicates('Country')
    .set_index('Country')['Value']
)

needs_normalizing = df_totals['Series Code'].isin(series_to_normalize)
country_population = df_totals.loc[needs_normalizing, 'Country'].map(population_by_country)
normalized_values = df_totals.loc[needs_normalizing, 'Value'] * 100 / country_population

# Countries without a population figure get 0, as before. Writing through
# df_totals.loc replaces the chained `df_totals['Value'].at[...] = ...`,
# which pandas does not guarantee to write back into df_totals.
df_totals.loc[needs_normalizing, 'Value'] = normalized_values.where(country_population.notna(), 0.)


import seaborn as sns
#sns.set_theme(style="ticks")
#A = dict_S_Code_indexes[all_Series_Codes[0]].ravel()
B = dict_S_Code_indexes[all_Series_Codes[27]].ravel()
#C = dict_S_Code_indexes[all_Series_Codes[29]].ravel()
D = dict_S_Code_indexes[all_Series_Codes[30]].ravel()
E = dict_S_Code_indexes[all_Series_Codes[31]].ravel()
indexes = np.concatenate((B,D,E), axis=0)


# Show the results of a linear regression within each dataset
pl  = sns.lmplot(x="deaths", y="Value", col="Series Code", hue="Series Name", data=df_totals.loc[indexes],
           col_wrap=2, ci=None, palette="muted", height=4, sharey=False,
           scatter_kws={"s": 50, "alpha": 1})

plt.savefig('corona_health_exp_3.jpg')

In [None]:
import seaborn as sns
#sns.set_theme(style="ticks")
A = dict_S_Code_indexes[all_Series_Codes[1]].ravel()
B = dict_S_Code_indexes[all_Series_Codes[2]].ravel()
C = dict_S_Code_indexes[all_Series_Codes[3]].ravel()
D = dict_S_Code_indexes[all_Series_Codes[4]].ravel()
E = dict_S_Code_indexes[all_Series_Codes[5]].ravel()
F = dict_S_Code_indexes[all_Series_Codes[6]].ravel()
G = dict_S_Code_indexes[all_Series_Codes[7]].ravel()
indexes = np.concatenate((A,B,C,D,E,F,G), axis=0)

# Show the results of a linear regression within each dataset
pl  = sns.lmplot(x="deaths", y="Value", col="Series Code", hue="Series Name", data=df_totals.loc[indexes],
           col_wrap=2, ci=None, palette="muted", height=4, sharey=False,
           scatter_kws={"s": 50, "alpha": 1})

# plt.savefig('corona_health_exp2.jpg')

In [None]:
import seaborn as sns
#sns.set_theme(style="ticks")
A = dict_S_Code_indexes[all_Series_Codes[8]].ravel()
B = dict_S_Code_indexes[all_Series_Codes[9]].ravel()
C = dict_S_Code_indexes[all_Series_Codes[10]].ravel()
D = dict_S_Code_indexes[all_Series_Codes[11]].ravel()
E = dict_S_Code_indexes[all_Series_Codes[12]].ravel()
F = dict_S_Code_indexes[all_Series_Codes[13]].ravel()
G = dict_S_Code_indexes[all_Series_Codes[14]].ravel()
H = dict_S_Code_indexes[all_Series_Codes[15]].ravel()

indexes = np.concatenate((A,B,C,D,E,F,G,H), axis=0)


# Show the results of a linear regression within each dataset
pl  = sns.lmplot(x="deaths", y="Value", col="Series Code", hue="Series Name", data=df_totals.loc[indexes],
           col_wrap=2, ci=None, palette="muted", height=4, sharey=False,
           scatter_kws={"s": 50, "alpha": 1})

# plt.savefig('corona_health_exp2.jpg')

In [None]:
import seaborn as sns
#sns.set_theme(style="ticks")
E = dict_S_Code_indexes[all_Series_Codes[0]].ravel()
F = dict_S_Code_indexes[all_Series_Codes[29]].ravel()
A = dict_S_Code_indexes[all_Series_Codes[16]].ravel()
B = dict_S_Code_indexes[all_Series_Codes[17]].ravel()
C = dict_S_Code_indexes[all_Series_Codes[18]].ravel()
D = dict_S_Code_indexes[all_Series_Codes[19]].ravel()
indexes = np.concatenate((A,B,C,D,E,F), axis=0)


# Show the results of a linear regression within each dataset
pl  = sns.lmplot(x="deaths", y="Value", col="Series Code", hue="Series Name", data=df_totals.loc[indexes],
           col_wrap=2, ci=None, palette="muted", height=4, sharey=False,
           scatter_kws={"s": 50, "alpha": 1})

# plt.savefig('corona_health_exp2.jpg')

In [None]:
import seaborn as sns
#sns.set_theme(style="ticks")
A = dict_S_Code_indexes[all_Series_Codes[20]].ravel()
B = dict_S_Code_indexes[all_Series_Codes[21]].ravel()
C = dict_S_Code_indexes[all_Series_Codes[22]].ravel()
D = dict_S_Code_indexes[all_Series_Codes[23]].ravel()
E = dict_S_Code_indexes[all_Series_Codes[24]].ravel()
F = dict_S_Code_indexes[all_Series_Codes[25]].ravel()
indexes = np.concatenate((A,B,C,D,E,F), axis=0)


# Show the results of a linear regression within each dataset
pl  = sns.lmplot(x="deaths", y="Value", col="Series Code", hue="Series Name", data=df_totals.loc[indexes],
           col_wrap=2, ci=None, palette="muted", height=4, sharey=False,
           scatter_kws={"s": 50, "alpha": 1})

# plt.savefig('corona_health_exp2.jpg')

In [None]:
import seaborn as sns
#sns.set_theme(style="ticks")
A = dict_S_Code_indexes[all_Series_Codes[26]].ravel()
B = dict_S_Code_indexes[all_Series_Codes[28]].ravel()
C = dict_S_Code_indexes[all_Series_Codes[32]].ravel()
D = dict_S_Code_indexes[all_Series_Codes[33]].ravel()
indexes = np.concatenate((A,B,C,D), axis=0)


# Show the results of a linear regression within each dataset
pl  = sns.lmplot(x="deaths", y="Value", col="Series Code", hue="Series Name", data=df_totals.loc[indexes],
           col_wrap=2, ci=None, palette="muted", height=4, sharey=False,
           scatter_kws={"s": 50, "alpha": 1})

# plt.savefig('corona_health_exp2.jpg')

# Put each Series in its own column

In [None]:
def make_df_wide_in_columns(df, code_col):
    '''
    Turns the long-format df (one row per country and series) into a wide
    DataFrame with one row per country and one column per series code.

    input:
    df: DataFrame with the columns 'Country', code_col, 'Value', 'deaths'
        and 'deaths_pp'
    code_col: name of the column holding the series codes

    output:
    df_wide: one row per country (in order of first appearance in df) with a
        column per series code (in order of first appearance), followed by
        'Country', 'deaths' and 'deaths_pp'. A series that is missing for a
        country is stored as -1. The index is 0..n-1.
    '''
    long_rows = df.dropna(subset=['Country', code_col])
    countries = long_rows['Country'].unique()
    series_codes = long_rows[code_col].unique()

    # One row per country; when a (country, code) pair occurs more than once
    # the last value wins, as in the former row-by-row fill. Building the
    # frame from the data itself avoids the pre-allocated filler rows
    # (all -1, Country 'nan') of the earlier implementation.
    df_wide = (
        long_rows.drop_duplicates(['Country', code_col], keep='last')
        .pivot(index='Country', columns=code_col, values='Value')
        .reindex(index=countries, columns=series_codes)
        .fillna(-1.)
    )
    df_wide.columns.name = None

    last_row_per_country = long_rows.drop_duplicates('Country', keep='last').set_index('Country')
    df_wide['Country'] = df_wide.index.values
    df_wide['deaths'] = last_row_per_country['deaths'].reindex(countries).values
    df_wide['deaths_pp'] = last_row_per_country['deaths_pp'].reindex(countries).values

    return df_wide.reset_index(drop=True)

In [None]:
# make wide df
df_totals_wide = make_df_wide_in_columns(df_totals, 'Series Code')
df_totals_wide.head()

In [None]:
print('MAX Corona_deaths_per_capita*100000', df_totals_wide['deaths_pp'].max() )
print('MIN Corona_deaths_per_capita*100000' , df_totals_wide['deaths_pp'].min()  )
print('Mean Corona_deaths_per_capita*100000' , df_totals_wide['deaths_pp'].mean()  )

In [None]:
# sort on deaths_pp ascending order
df_totals_wide = df_totals_wide.sort_values(['deaths_pp'], axis=0)
df_totals_wide.tail()

In [None]:
df_totals_wide = df_totals_wide.reset_index()
del df_totals_wide['index']

In [None]:
y = df_totals_wide['deaths_pp'].values
x = df_totals_wide['Country'].values
plt.figure(figsize=(25,10))

plt.rc('xtick', labelsize=20) 
plt.rc('ytick', labelsize=20) 

plt.plot(x[-40:], y[-40:])
plt.grid()
plt.xticks(x[-40:], rotation=90)
plt.show()

In [None]:
#import matplotlib
#import matplotlib.pyplot as plt
#import numpy as np


y = df_totals_wide['deaths_pp'].values
x = df_totals_wide['Country'].values
y = y[:]
x = x[:]

plt.rc('xtick', labelsize=10) 
plt.rc('ytick', labelsize=10) 
width = 0.8  # the width of the bars

fig, ax = plt.subplots(figsize=(20,40))
rects1 = ax.barh(x, y, width)

# Add some text for labels, title and custom x-axis tick labels, etc.
#ax.set_ylabel('')
ax.set_title('Covid-19 deaths per 100000 inhabitants')
#ax.set_xticks(x)
#ax.set_xticklabels(x)

#ax.legend()


def autolabel(rects, y):
    """Write the value y[i] next to the end of the i-th horizontal bar in *rects*.

    The labels are drawn on the module-level axes `ax`.
    """
    for label_nr, rect in enumerate(rects):
        bar_middle = rect.get_y() + rect.get_height()/2
        ax.annotate('{:0.1f}'.format(y[label_nr]),
                    xy=(rect.get_width(), bar_middle),
                    xytext=(20, -5),  # offset in points: 20 to the right, 5 down
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1, y)
#autolabel(rects2)

#fig.tight_layout()
plt.savefig('Covid-19 deaths per 100000 inhabitants_2.png')
plt.show()

In [None]:
plt.hist(df_totals_wide['deaths_pp'].values, bins = 50)
plt.show()

In [None]:
# changing the country names
df_data_control.reset_index(inplace=True)
df_data_control = df_data_control.rename(columns={'index':'Country'})

In [None]:
df_data_control , the_old_names = change_name(df_data_control, 'Country', dict_country_names_data_corona)
#print(the_old_names)
print(len(dict_country_names_data_corona), len(the_old_names))

In [None]:
# make country the index
df_data_control.index = df_data_control.Country.values



In [None]:
# add control data from Oxford Univesity
df_totals_wide_control = df_totals_wide.copy()
df_totals_wide_control.index = df_totals_wide_control.Country.values

control_columns = ['first_action_delay', 'control_score']
# one row of control data per country (first one wins if a name occurs twice)
control_by_country = df_data_control.loc[
    ~df_data_control.index.duplicated(keep='first'), control_columns]

# Countries without government-response data cannot be used: report and drop them.
has_control_data = df_totals_wide_control.index.isin(control_by_country.index)
countries_without_control = df_totals_wide_control.index[~has_control_data].tolist()
if countries_without_control:
    print('dropped, no control data:', countries_without_control)
df_totals_wide_control = df_totals_wide_control[has_control_data].copy()

# Whole-column assignment instead of the former per-country chained `.at`,
# which pandas does not guarantee to write back into the frame.
for control_column in control_columns:
    df_totals_wide_control[control_column] = (
        control_by_country[control_column]
        .reindex(df_totals_wide_control.index)
        .astype('float64')
        .values
    )

In [None]:
cols = list(df_totals_wide_control.columns.ravel())
_ = cols.remove('deaths')
_ = cols.remove('deaths_pp')

In [None]:
# make a dict with Series Codes as key and Series Name as value
# Take code and name from the same row: two separate unique() calls only line
# up when every code has exactly one name and no two codes share a name.
series_code_names = df_health[['Series Code', 'Series Name']].drop_duplicates('Series Code')
dict_Series_Codes = dict(zip(series_code_names['Series Code'], series_code_names['Series Name']))

dict_Series_Codes['first_action_delay'] = 'number of days between first infection case and government action'
dict_Series_Codes['control_score'] = 'total sum of actions over the total period till now'

In [None]:
# display a heatmap of the correlations with covid_19_deaths_per_e5_capita
# `cols` still contains the text column 'Country' at this point; restrict the
# correlation to numeric columns, otherwise corrwith fails on the strings.
numeric_features = df_totals_wide_control.loc[:,cols].select_dtypes(include='number')
corr = numeric_features.corrwith(df_totals_wide_control.deaths_pp)
data_corr = pd.DataFrame(corr)
data_corr.sort_values([0],ascending=False ,inplace=True)

f, ax = plt.subplots(figsize=(20, 20))
ax = sns.heatmap(data_corr[:], square=True, annot=True)
plt.title('Correlations with covid-19 deaths per 100,000')
plt.show()



### There are 3 features that have a correlation with deaths (larger than .4)
### URBAN_POP, SH.XPD.GHED.GE.ZS, SH.XPD.GHED.GD.ZS

In [None]:
df_corr_high = data_corr[data_corr > .4].dropna()[:]
df_corr_high

In [None]:

print('features that have a significant correlation (> .4) with deaths:')
high_corr = data_corr[data_corr > .4].dropna()[:].index
for high in high_corr:
    print(high)
    print('code:', high, 'name:', dict_Series_Codes[high], 'corr. : %.2f'% df_corr_high.loc[high][0])



In [None]:
# "spearman" correlations

# `cols` still contains the text column 'Country' at this point; restrict the
# correlation to numeric columns, otherwise corrwith fails on the strings.
numeric_features = df_totals_wide_control.loc[:,cols].select_dtypes(include='number')
corr = numeric_features.corrwith(df_totals_wide_control.deaths_pp, method='spearman')
data_corr = pd.DataFrame(corr)
data_corr.sort_values([0],ascending=False ,inplace=True)

f, ax = plt.subplots(figsize=(20, 20))
ax = sns.heatmap(data_corr[:], square=True, annot=True)
plt.title('Spearman correlations with covid-19 deaths per 100,000')
plt.show()




In [None]:
# spearman : positive relation
df_corr_high = data_corr[data_corr > .4].dropna()[:]
df_corr_high

In [None]:
# spearman : negative relation
df_corr_high = data_corr[data_corr < -.4].dropna()[:]
df_corr_high

In [None]:
df_totals_wide_control.reset_index(inplace=True)
del df_totals_wide_control['index']
_ = cols.remove('Country')

In [None]:
from catboost import CatBoost
model = CatBoost()
X_train = df_totals_wide_control.loc[:,cols]
y_train = df_totals_wide_control.loc[:,'deaths_pp'].ravel()

fit = model.fit(X=X_train, y=y_train, verbose=False )

In [None]:
f_importance = model.feature_importances_.ravel()

y = X_train.columns.values
plt.figure(figsize=(5,10))
plt.barh(y,f_importance)
plt.show()

In [None]:
df_feature_importance = pd.DataFrame(f_importance, index=y)
df_feature_importance = df_feature_importance.sort_values(by=[0] , axis=0, ascending=False)
df_feature_importance

In [None]:
# report every feature, most important first, and whether it reaches the threshold of 2
for feature, importance in df_feature_importance[0].items():
    level = 'Lower' if importance < 2 else 'Higher'
    print('feature %s importance is %s than 2' % (feature, level))
    print('importance = %.2f :' % importance, dict_Series_Codes[feature])
    print()

In [None]:
df_totals_wide_control.index = df_totals_wide_control.Country


In [None]:
# make list of indexes of the columns except Country and covid-19-deaths covid_19_deaths_per_e5_capita
# positions of all feature columns, i.e. everything except the columns in list_not_include
list_of_all_columns = list(df_totals_wide_control.columns)
list_not_include = ['Country', 'deaths', 'deaths_pp']

cat_features = sorted(
    list_of_all_columns.index(col)
    for col in list_of_all_columns
    if col not in list_not_include
)
print(cat_features)
print(list_of_all_columns)

In [None]:
# run CatBoostRegressor over all countries except one

from catboost import CatBoostRegressor

# Leave-one-out: for every country, train on all other countries and predict
# the deaths per 100.000 inhabitants of the country that was left out.
loo_features = df_totals_wide_control.iloc[:, cat_features]
loo_target = df_totals_wide_control['deaths_pp']

df_result = pd.DataFrame(df_totals_wide_control.deaths_pp)
df_result['prediction'] = 0.0

for counter, country in enumerate(df_totals_wide_control.index):
    held_out = df_totals_wide_control.index == country
    model = CatBoostRegressor(random_seed=20)
    model.fit(loo_features[~held_out], loo_target[~held_out].values, verbose=False, plot=False, )
    # Predict on the same column layout the model was trained on, and write
    # through df_result.loc (the former `df_result.prediction.at[...] = ...`
    # is a chained assignment pandas does not guarantee to write back).
    df_result.loc[held_out, 'prediction'] = model.predict(loo_features[held_out])
    if counter %2 == 0:
        print(counter, ',', end="")

In [None]:
plt.figure(figsize=(20,7))
X = df_result.index.values
y1 = df_result.deaths_pp.values
y2 = df_result.prediction.values
plt.scatter(X,y1, label='covid 19 deaths per 100 000')
plt.scatter(X,y2, label='prediction')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
df_result.prediction.mean(), df_result.prediction.std(), df_result.prediction.max(), df_result.prediction.min()

In [None]:
plt.hist(df_result.prediction, bins=50)
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error
print('MAE : ', mean_absolute_error(df_result.deaths_pp, df_result.prediction))

from sklearn.metrics import r2_score
print('R2 score: ',r2_score(df_result.deaths_pp, df_result.prediction))


def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute error relative to (y_true + 1), as a percentage.

    The +1 in the denominator avoids a division by zero for y_true == 0.
    Note: does not handle mixed 1d representations.
    """
    relative_error = (y_true - y_pred) / (y_true + 1)
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error) * 100

print('MAPE',mean_absolute_percentage_error(df_result.deaths_pp, df_result.prediction))

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Extra-Trees regressor. Only the non-default settings are passed; the
# arguments that used to be spelled out here were all library defaults, and
# several of those spellings (criterion='mse', max_features='auto',
# min_impurity_split) were removed from later scikit-learn releases.
model_etr = ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=1496)

df_result_etr = pd.DataFrame(df_totals_wide_control.deaths_pp)
df_result_etr['prediction'] = 0.0

# Leave-one-out evaluation: refit on all rows except the held-out one, then
# predict the held-out row. warm_start is off (default), so every fit()
# rebuilds the forest from scratch.
for counter, country in enumerate(df_totals_wide_control.index):
    train_rows = df_totals_wide_control.drop(index=country)
    X = train_rows.iloc[:, cat_features]
    # 1-D target: a column vector triggers a DataConversionWarning, and
    # Series.ravel() is deprecated in recent pandas.
    y = train_rows.deaths_pp.to_numpy()
    model_etr.fit(X, y)

    # Held-out features, selected positionally like the training features and
    # kept as a 1 x n_features frame.
    X_test = df_totals_wide_control.loc[[country]].iloc[:, cat_features]
    predict = model_etr.predict(X_test)

    # Assign through the frame; `df.col.at[...] = v` is chained assignment
    # and leaves the frame unchanged under pandas Copy-on-Write.
    df_result_etr.at[country, 'prediction'] = float(predict[0])

    # Lightweight progress indicator (every third fold).
    if counter % 3 == 0:
        print(counter, ',', end="")

In [None]:
# Reported deaths vs. leave-one-out predictions, one x position per index entry.
# The data is passed straight to scatter() so the global `X` (the training
# feature matrix of the previous cell) is not overwritten with index labels.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(df_result_etr.index.values, df_result_etr.deaths_pp.values,
           label='covid 19 deaths per 100 000')
ax.scatter(df_result_etr.index.values, df_result_etr.prediction.values,
           label='prediction')
ax.set_title('ExtraTrees leave-one-out prediction vs. reported COVID-19 deaths')
ax.set_xlabel('country')
ax.set_ylabel('deaths per 100 000')
ax.tick_params(axis='x', labelrotation=90)  # index labels are long; rotate them
ax.grid()
ax.legend()
plt.show()

In [None]:
# Mean, standard deviation, maximum and minimum of the predictions,
# shown as a tuple in that order.
tuple(
    getattr(df_result_etr.prediction, stat)()
    for stat in ('mean', 'std', 'max', 'min')
)

In [None]:
# Histogram of the predicted values (50 bins).
fig, ax = plt.subplots()
ax.hist(df_result_etr.prediction, bins=50)
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the predictions in df_result_etr against the reported deaths_pp.
mae_value = mean_absolute_error(df_result_etr.deaths_pp, df_result_etr.prediction)
print('MAE : ', mae_value)

r2_value = r2_score(df_result_etr.deaths_pp, df_result_etr.prediction)
print('R2 score: ', r2_value)


def mean_absolute_percentage_error(y_true, y_pred):
    """Return a smoothed mean absolute percentage error, in percent.

    Every absolute error is divided by ``y_true + 1`` instead of ``y_true``
    (presumably so that a true value of 0 does not divide by zero). The
    result is therefore not the textbook MAPE and is smaller than it for
    positive targets.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, ndarray or pandas Series).
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / (y_true + 1)) * 100``.
    """
    # Convert up front so plain Python lists work and pandas inputs are
    # compared position by position rather than being aligned on their index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    return np.mean(np.abs((y_true - y_pred) / (y_true + 1))) * 100

# Percentage error of the predictions in df_result_etr; as the cell's last
# expression the value is displayed as the cell output.
mean_absolute_percentage_error(
    y_true=df_result_etr.deaths_pp,
    y_pred=df_result_etr.prediction,
)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
import sklearn

# scikit-learn 1.0 renamed the 'mae' criterion to 'absolute_error' (the old
# spelling was removed in 1.2); pick whichever the installed version accepts.
mae_criterion = (
    'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae'
)

# Extra-Trees regressor with absolute-error splits and at least 5 samples per
# leaf. Only non-default settings are passed.
# warm_start is deliberately left at its default (False): with warm_start=True
# and a constant n_estimators, only the first fit() call trains any trees, so
# every later fold would be predicted by a forest that had already seen the
# held-out row during training.
model_etr = ExtraTreesRegressor(criterion=mae_criterion, min_samples_leaf=5,
                                n_estimators=100, n_jobs=8,
                                random_state=1496)

df_result_etr = pd.DataFrame(df_totals_wide_control.deaths_pp)
df_result_etr['prediction'] = 0.0

# Leave-one-out evaluation: refit on all rows except the held-out one, then
# predict the held-out row.
for counter, country in enumerate(df_totals_wide_control.index):
    train_rows = df_totals_wide_control.drop(index=country)
    X = train_rows.iloc[:, cat_features]
    # 1-D target: a column vector triggers a DataConversionWarning, and
    # Series.ravel() is deprecated in recent pandas.
    y = train_rows.deaths_pp.to_numpy()
    model_etr.fit(X, y)

    # Held-out features, selected positionally like the training features and
    # kept as a 1 x n_features frame.
    X_test = df_totals_wide_control.loc[[country]].iloc[:, cat_features]
    predict = model_etr.predict(X_test)

    # Assign through the frame; `df.col.at[...] = v` is chained assignment
    # and leaves the frame unchanged under pandas Copy-on-Write.
    df_result_etr.at[country, 'prediction'] = float(predict[0])

    # Lightweight progress indicator (every third fold).
    if counter % 3 == 0:
        print(counter, ',', end="")

In [None]:
# Reported deaths vs. leave-one-out predictions, one x position per index entry.
# The data is passed straight to scatter() so the global `X` (the training
# feature matrix of the previous cell) is not overwritten with index labels.
fig, ax = plt.subplots(figsize=(20, 7))
ax.scatter(df_result_etr.index.values, df_result_etr.deaths_pp.values,
           label='covid 19 deaths per 100 000')
ax.scatter(df_result_etr.index.values, df_result_etr.prediction.values,
           label='prediction')
ax.set_title('ExtraTrees leave-one-out prediction vs. reported COVID-19 deaths')
ax.set_xlabel('country')
ax.set_ylabel('deaths per 100 000')
ax.tick_params(axis='x', labelrotation=90)  # index labels are long; rotate them
ax.grid()
ax.legend()
plt.show()

In [None]:
# Mean, standard deviation, maximum and minimum of the predictions,
# shown as a tuple in that order.
tuple(
    getattr(df_result_etr.prediction, stat)()
    for stat in ('mean', 'std', 'max', 'min')
)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the predictions in df_result_etr against the reported deaths_pp.
mae_value = mean_absolute_error(df_result_etr.deaths_pp, df_result_etr.prediction)
print('MAE : ', mae_value)

r2_value = r2_score(df_result_etr.deaths_pp, df_result_etr.prediction)
print('R2 score: ', r2_value)


def mean_absolute_percentage_error(y_true, y_pred):
    """Return a smoothed mean absolute percentage error, in percent.

    Every absolute error is divided by ``y_true + 1`` instead of ``y_true``
    (presumably so that a true value of 0 does not divide by zero). The
    result is therefore not the textbook MAPE and is smaller than it for
    positive targets.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, ndarray or pandas Series).
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / (y_true + 1)) * 100``.
    """
    # Convert up front so plain Python lists work and pandas inputs are
    # compared position by position rather than being aligned on their index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    return np.mean(np.abs((y_true - y_pred) / (y_true + 1))) * 100

# Keep the percentage error of the df_result_etr predictions in MAPE and
# report it.
MAPE = mean_absolute_percentage_error(
    y_true=df_result_etr.deaths_pp,
    y_pred=df_result_etr.prediction,
)
print('MAPE', MAPE)