This notebook investigates whether there are relations between the Covid-19 deaths in a country and features 
that reflect the country's public-health policies, such as immunization, health expenditure (both government and private), malaria, tuberculosis, smoking, poverty, the number of undernourished people and GDP.
Population density and the percentage of people living in urban areas are also considered.  
  
According to Our World in Data:
https://ourworldindata.org/coronavirus#deaths-due-to-covid-19
Quote: 
    1. the actual total death toll from COVID-19 is likely to be higher than the number of confirmed deaths – this is due to limited testing and problems in the attribution of the cause of death; the difference between reported confirmed deaths and total deaths varies by country  
    2. how COVID-19 deaths are recorded may differ between countries (e.g. some countries may only count hospital deaths, whilst others have started to include deaths in homes)  
    3. the reported death figures on a given date does not necessarily show the number of new deaths on that day: this is due to delays in reporting.

Data from the World Bank is published under a Creative Commons (CC) license.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. load Covid-19 data from  
https://www.kaggle.com/themlphdstudent/novel-covid19-dataset



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
file_covid_19 = '/kaggle/input/novel-covid19-dataset/time_series_covid19_deaths_global.csv'
df_data_covid_19 = pd.read_csv(file_covid_19)
df_data_covid_19.head(2)

In [None]:
# Sum all provinces into one row per country and keep only the most recent date column.
# The columns kept are: the country name, the two numeric columns that follow it,
# and the last (latest) date column, which is renamed to 'deaths'.

test = df_data_covid_19.copy()
# numeric_only=True restricts the aggregation to numeric columns on every pandas version,
# so the positional selection below always picks the same columns.
df_data_covid_19 = test.groupby('Country/Region', as_index=False).sum(numeric_only=True)
df_data_covid_19 = df_data_covid_19.iloc[:, [0, 1, 2, -1]].copy()
print(df_data_covid_19.iloc[0, [-1]])
# Rename whatever the latest date column is called instead of hard-coding '10/30/20':
# with a refreshed dataset the hard-coded name would silently leave no 'deaths' column
# and every later cell that reads 'deaths' would fail.
latest_date_column = df_data_covid_19.columns[-1]
df_data_covid_19 = df_data_covid_19.rename(
    columns={'Country/Region': 'Country', latest_date_column: 'deaths'})
df_data_covid_19.head()

# 2. Load the data from the World Bank

In [None]:
# World Bank health indicators, read from
# /kaggle/input/covid19correlationswithdatafromworldbank/World_Bank_Data_all_health.csv
path = '/kaggle/input/covid19correlationswithdatafromworldbank'
file = 'World_Bank_Data_all_health.csv'
healthFile = '/'.join([path, file])
df_health = pd.read_csv(healthFile)

In [None]:
df_health.head(3)

In [None]:
# the total number of different data are:

len(df_health['Series Code'].unique())

# 3. the country names in World Data and Covid-19 data are not the same
Make a dict with names to be changed as keys 

In [None]:
# Mapping used to harmonise country names between the two data sources:
# key   = country name as written in the World Bank data
# value = country name as written in df_data_covid_19
# Countries that are spelled identically in both sources need no entry.

dict_country_names_data_corona = {
    'Bahamas, The' : 'Bahamas',
    'Brunei Darussalam' : 'Brunei',
    'Congo, Rep.' : 'Congo (Brazzaville)',
    'Congo, Dem. Rep.'   : 'Congo (Kinshasa)',
    'Czech Republic' : 'Czechia',
    'Egypt, Arab Rep.' : 'Egypt',
    'Gambia, The' : 'Gambia',
    'Iran, Islamic Rep.' : 'Iran',
    'Korea, Rep.' : 'Korea, South',
    'Kyrgyz Republic' : 'Kyrgyzstan',
    'Lao PDR' : 'Laos',
    'Russian Federation' : 'Russia',
    'Slovak Republic' : 'Slovakia',
    'Syrian Arab Republic' : 'Syria',    
    'United States' : 'US',
    'Venezuela, RB' : 'Venezuela',
    'Yemen, Rep.' : 'Yemen'
        }

In [None]:
def change_name(df, country, dictionary):
    '''
    Adds a 'Country' column to df holding the country names harmonised via a mapping.

    input:
    df:  DataFrame where country names have to be changed (modified in place:
         the 'Country' column is added or overwritten)
    country: the column with the country names to be changed
    dictionary: the dict with the old names as keys and the new names as values

    output:
    returns the same DataFrame with the column 'Country' holding the corrected names
    returns a list with the old names that were replaced, one entry per changed row
    (in row order, so a name occurring in several rows is listed several times)
    '''
    original_names = df[country]

    # Rows whose current name is a key of the mapping; all other names are kept as they are.
    to_change = original_names.isin(list(dictionary))
    list_of_changed_countries = original_names[to_change].tolist()

    # Assign the whole column at once instead of writing element-wise through
    # `df.Country.iat[...]`: that chained assignment may act on a temporary copy
    # and then leaves the DataFrame unchanged.
    df['Country'] = original_names.map(lambda name: dictionary.get(name, name))

    return df, list_of_changed_countries

In [None]:
# changing the country names
df_health , the_old_names = change_name(df_health, 'Country', dict_country_names_data_corona)
#print(the_old_names)
print(len(dict_country_names_data_corona), len(the_old_names))

In [None]:
print(df_health['Series Code'].unique())

In [None]:
print(df_health['Series Code'].isna().sum())

In [None]:
df_health.shape

In [None]:
df_health = df_health.sort_values('Series Code')

In [None]:
df_health.tail()

In [None]:
# Renumber the rows 0..n-1 after sorting, discarding the old index values.
df_health = df_health.reset_index(drop=True)
df_health.tail()

In [None]:
# Keep only the first 6011 rows (positional slice) as an independent copy.
# NOTE(review): 6011 is a hard-coded row count; presumably it cuts off trailing rows
# without a 'Series Code' after the earlier sort - confirm, and consider deriving the
# cut-off from the data instead.
df_health = df_health[:6011].copy()
df_health

In [None]:
# test country US

df_health[df_health.Country == 'US']

# Collect all data into one df

In [None]:
# make a dict with Series Codes as key and Series Name as value
codes = df_health['Series Code'].unique()
names = df_health['Series Name'].unique()
# Take each code's name from the same row (first occurrence of the code) rather than
# pairing the two independent unique() arrays by position: those only line up when
# codes and names are strictly one-to-one and appear in the same order.
first_row_per_code = df_health.drop_duplicates(subset='Series Code')
dict_healt_codes = dict(zip(first_row_per_code['Series Code'],
                            first_row_per_code['Series Name']))
    

In [None]:
# example:
dict_healt_codes['SH.IMM.HEPB']

In [None]:
def concat_df(df1, df2, df2_column):
    '''
    adds the df2_column from df2 to a copy of df1, matching rows on 'Country'
    input: df1: base df (not modified), must contain a 'Country' column
        df2 : df to take the column from, must contain 'Country' and df2_column
        df2_column: name of the column of df2 to add to df1
    output df = df1 + df2_column, where for every row of df1:
        - the value of the matching country in df2 is used (if df2 lists the country
          several times, the last of those rows wins);
        - a missing value (NaN) in df2 is carried over as NaN;
        - a country that does not occur in df2 gets 0.
    '''
    df = df1.copy()

    # One value per country. Rows without a country name can never match and are ignored.
    lookup = (
        df2.dropna(subset=['Country'])
           .drop_duplicates(subset='Country', keep='last')
           .set_index('Country')[df2_column]
    )

    countries = df['Country']
    has_match = countries.isin(lookup.index)

    # Vectorised lookup replaces the nested row loops and the element-wise
    # `df[col].iat[...] = ...` chained assignment, which may write to a temporary copy.
    # NaN values from df2 are deliberately kept: the former `!= np.nan` test was always
    # True (NaN never compares equal), so NaN has always been copied over.
    df[df2_column] = countries.map(lookup).where(has_match, 0.).astype(float)

    return df

In [None]:
codes

In [None]:

df_temp = df_data_covid_19.copy()
df_temp.shape

In [None]:

df_temp = df_health[df_health['Series Code']== codes[6]]
df_temp.shape

In [None]:
len(df_health.Country.unique()), len(df_health['Country Code'].unique())

In [None]:
# Add one column per World Bank series code to the Covid-19 frame.
# concat_df copies the series' 'Value' into a scratch 'Value' column, which is then
# duplicated under the series code; the scratch column is removed once all codes are done.
for code in codes:
    print(code)
    df_temp = df_health.loc[df_health['Series Code'] == code]
    df_data_covid_19 = concat_df(df_data_covid_19, df_temp, 'Value')
    print('df_data_covid_19.shape', df_data_covid_19.shape)
    df_data_covid_19[code] = df_data_covid_19['Value']
df_data_covid_19 = df_data_covid_19.drop(columns='Value')  # scratch column no longer needed

In [None]:
df_data_covid_19.head()

In [None]:
# SP.POP.TOTL is the country population total
df_data_covid_19  = df_data_covid_19.rename(columns={'SP.POP.TOTL':'Population'})


In [None]:
df_data_covid_19.shape

In [None]:
df_data_covid_19

In [None]:
df_data_covid_19[df_data_covid_19['SURF.AREA'].isna()]


In [None]:
df_data_covid_19_Backup = df_data_covid_19.copy()

In [None]:
print(df_data_covid_19.shape)
# Remove the rows for which no surface area is available (SURF.AREA is NaN).
surface_missing = df_data_covid_19['SURF.AREA'].isna()
df_data_drop = df_data_covid_19[surface_missing]

df_data_covid_19 = df_data_covid_19.drop(index=df_data_drop.index)
df_data_covid_19.shape

In [None]:
# Remove the rows whose Population is exactly 0 (0. is the value concat_df leaves
# when a country does not occur in the World Bank data).
population_is_zero = df_data_covid_19.Population == 0
zero_population = df_data_covid_19[population_is_zero]

df_data_covid_19 = df_data_covid_19.drop(index=zero_population.index)
df_data_covid_19.shape

In [None]:
# save this df
df_data_covid_19.to_csv('World_data_health_GDP_surface_urban_population.csv', index=False)

In [None]:
# calculate the relative deaths per 100000 inhabitants
df_data_covid_19['covid_19_deaths_per_e5_capita'] = 100000 * df_data_covid_19['deaths'] / (df_data_covid_19.Population)


In [None]:
print('MAX covid_19_deaths_per_e5_capita', df_data_covid_19['covid_19_deaths_per_e5_capita'].max() )
print('MIN covid_19_deaths_per_e5_capita' , df_data_covid_19['covid_19_deaths_per_e5_capita'].min()  )
print('Mean covid_19_deaths_per_e5_capita' , df_data_covid_19['covid_19_deaths_per_e5_capita'].mean()  )

In [None]:
# Order the countries by deaths per 100,000 inhabitants (ascending) and renumber the rows.
df_data_covid_19 = (
    df_data_covid_19
    .sort_values(by='covid_19_deaths_per_e5_capita')
    .reset_index(drop=True)
)

In [None]:
# make a bar chart
import matplotlib
import matplotlib.pyplot as plt
import numpy as np


# Horizontal bar chart: one bar per country, bar length = deaths per 100,000 inhabitants.
y = df_data_covid_19['covid_19_deaths_per_e5_capita'].values
x = df_data_covid_19['Country'].values

for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=10)
width = 0.8  # the width of the bars

fig, ax = plt.subplots(figsize=(20,40))
rects1 = ax.barh(x, y, width)
ax.set_title('Covid-19 deaths per 100000 inhabitants')


def autolabel(rects, y):
    """Write the value from *y* next to the end of each horizontal bar in *rects*.

    *rects* and *y* are paired by position; the labels are drawn on the
    notebook-level `ax` created above.
    """
    for rect, value in zip(rects, y):
        bar_middle = rect.get_y() + rect.get_height() / 2
        ax.annotate('{:0.1f}'.format(value),
                    xy=(rect.get_width(), bar_middle),
                    xytext=(20, -5),  # shift the text 20 points right and 5 points down
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1, y)

plt.savefig('Covid-19 deaths per 100000 inhabitants_3.png')
plt.show()

In [None]:
more_than_70_deaths_prt = df_data_covid_19[df_data_covid_19.covid_19_deaths_per_e5_capita > 70.]
more_than_70_deaths_prt

In [None]:
less_than_20_deaths_prt = df_data_covid_19[df_data_covid_19.covid_19_deaths_per_e5_capita < 20.]
less_than_20_deaths_prt

In [None]:
# correlations with deaths
# Work on a copy without the (non-numeric) 'Country' column.
import seaborn as sns # sns.set_theme()
data = df_data_covid_19.copy()
del data['Country']

# Positional slice: skips the first remaining column and casts all others to float.
# NOTE(review): confirm that leaving out the first column (and keeping the second) is intended.
uniform_data = data.iloc[:,1:].astype('float')
#uniform_data.info()
#uniform_data.info()

In [None]:
# display a heatmap of the correlations with covid_19_deaths_per_e5_capita
corr = uniform_data.corrwith(df_data_covid_19.covid_19_deaths_per_e5_capita)
data_corr = pd.DataFrame(corr)
data_corr.sort_values([0],ascending=False ,inplace=True)

f, ax = plt.subplots(figsize=(20, 20))
ax = sns.heatmap(data_corr, square=True, annot=True)
plt.title('Correlations with covid-19 deaths per 100,000')
plt.show()


In [None]:
# Features whose correlation with the deaths per 100,000 is above 0.1.
# 'SH...' series codes are printed with their full series name, the columns starting
# with 'co' (the covid-19 death rate itself) are skipped, everything else is printed as is.
data_increase = data_corr[data_corr[0] > .1]
print('Data that increase the death cases')
print()

for feature in data_increase.index:
    prefix = feature[:2]
    if prefix == 'SH':
        print(dict_healt_codes[feature])
        continue
    if prefix != 'co':
        print(feature)

In [None]:
# Features whose correlation with the deaths per 100,000 is below -0.1.
# 'SH...' series codes are printed with their full series name, the columns starting
# with 'co' (the covid-19 death rate itself) are skipped, everything else is printed as is.
data_decrease = data_corr[data_corr[0] < -.1]
print('Data that decrease the death cases')
print()

for feature in data_decrease.index:
    prefix = feature[:2]
    if prefix == 'SH':
        print(dict_healt_codes[feature])
        continue
    if prefix != 'co':
        print(feature)


# what features are important


In [None]:
# first make features normalizes (per capita for GDP and per square km for surface)
# GDP_per_capita
df_data_covid_19['GDP_per_capita'] = df_data_covid_19['GDP_USdollars'] / df_data_covid_19.Population

# Population per square km
df_data_covid_19['Pop_pskm'] = df_data_covid_19.Population / df_data_covid_19['SURF.AREA']
df_data_covid_19.columns



In [None]:
dict_healt_codes['URBAN_POP']

In [None]:
# make index = Country for easy reference
df_data_covid_19
df_data_covid_19.index = df_data_covid_19.Country

In [None]:
df_data_covid_19.columns

In [None]:
# make a df without the absolute values : Population, GDP_USdollars, SURF.AREA, deaths
df_World_covid_19_data = df_data_covid_19.drop(columns=['Population', 'GDP_USdollars', 'SURF.AREA', 'deaths'])
# y is the regression target; y_backup refers to the same array (no copy is made)
y = df_World_covid_19_data.covid_19_deaths_per_e5_capita.values
y_backup = y
# the target must not remain among the feature columns
del df_World_covid_19_data['covid_19_deaths_per_e5_capita']
# make list of indexes of the columns except Country and covid-19-deaths covid_19_deaths_per_e5_capita
# cat_features ends up holding the sorted positional indexes of the selected feature columns
cat_features = [] 
list_of_all_columns = list(df_World_covid_19_data.columns)
# columns are selected by position: 1..28 and 31 onward, so column 0 and columns 29-30 are left out
# NOTE(review): these offsets depend on the exact column order - confirm which columns 29 and 30 are
list_of_columns = list(set(list_of_all_columns[1:29]) | set(list_of_all_columns[31:]) )
for col in list_of_columns:
    cat_features.append(list_of_all_columns.index(col))
cat_features.sort()
print(cat_features)
# list_of_columns comes from a set, so the printed order is not the column order
print(list_of_columns)

In [None]:
X = df_World_covid_19_data.iloc[:,cat_features]


In [None]:
# using random train and validation set
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X , y , test_size=0.33, random_state=10)


In [None]:
# using CatBoost as model
from catboost import CatBoostRegressor, Pool
eval_dataset = Pool(X_valid, y_valid)
model1 = CatBoostRegressor(iterations=1000)
                           
history = model1.fit(X_train, y_train, eval_set=eval_dataset,
                     use_best_model=True, verbose=False, plot=False)

In [None]:
# Plot the feature importances of the fitted CatBoost model, one horizontal bar per feature.
f_importance = model1.feature_importances_.ravel()

# Use a dedicated name for the bar labels: assigning them to `y` would overwrite the
# regression target that is defined earlier in the notebook under that name.
feature_names = X_train.columns.values
plt.figure(figsize=(5,7))
barh = plt.barh(feature_names, f_importance)
plt.xlabel('feature importance')
plt.title('CatBoost feature importances')
plt.show()

In [None]:
import shap
shap.initjs()

In [None]:
explainer = shap.TreeExplainer(model1)
shap_values_train = explainer.shap_values(Pool(X_train, y_train))
print(shap_values_train.shape , X_train.shape)
shap.force_plot(explainer.expected_value, shap_values_train, X_train)



In [None]:
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("URBAN_POP", shap_values_train, X_train)

    

In [None]:
dict_healt_codes['SH.MED.NUMW.P3']

In [None]:


# Looking at the countries with the highest deaths per capita

In [None]:
df_World_covid_19_data.loc[['Belgium','France','Netherlands','Germany', 'San Marino', 'Peru']].T

In [None]:
urban_pop_pct = 'URBAN_POP'
df_World_covid_19_data['covid_19_deaths_per_e5_capita'] = y_backup
colmumns_urban = ['Country', 'Pop_pskm', 'covid_19_deaths_per_e5_capita', urban_pop_pct]
df_urban = df_World_covid_19_data[colmumns_urban].copy()
df_urban[urban_pop_pct].plot(kind = 'hist', title='histogram of percentage of people living in urban areas')
plt.show()

In [None]:
# Scatter plot: share of the population living in urban areas vs. Covid-19 death rate.
# The names `x` and `y` are kept on purpose: a later cell fits a model on `X, y` and
# reads `y` as it is assigned here.
x = df_urban.URBAN_POP.values
y = df_urban.covid_19_deaths_per_e5_capita.values
plt.scatter(x,y)
plt.xlabel('Percentage of people living in urban areas')
plt.ylabel('Covid-19 deaths per 100.000 inhabitants')
# The plotted quantity is deaths per 100,000 inhabitants, not a percentage of deaths.
plt.title('Percentage of people living in urban areas vs. covid-19 deaths per 100,000 inhabitants')
plt.show()

In [None]:
# probability that Belgium has high deaths
df_World_covid_19_data



In [None]:
# Predict the deaths per 100,000 inhabitants for the US with the hold-out model.
# The feature columns are selected positionally on the DataFrame (iloc) and only then the
# country row is picked: indexing the row Series with integer positions
# (`row[cat_features]`) relies on pandas' deprecated positional fallback for
# label-indexed Series.
X_test = df_World_covid_19_data.iloc[:, cat_features].loc['US'].values

predict = model1.predict(X_test)

predict

In [None]:
predict.min()

In [None]:
# fit a model on all data
# Build the feature matrix and the target explicitly in this cell, so the fit does not
# depend on whatever `X` and `y` happen to hold from earlier (plotting) cells.
X = df_World_covid_19_data.iloc[:, cat_features]
y = df_World_covid_19_data['covid_19_deaths_per_e5_capita'].values
model2 = CatBoostRegressor(iterations=1000)
history2 = model2.fit(X, y, verbose=False, plot=False)

In [None]:
predict = model2.predict(X)
predict.max()

In [None]:
# Compare the prediction of the model fitted on all data with the actual value for the US.
# Feature columns are selected with iloc on the DataFrame before the country row is picked;
# integer-position indexing on the label-indexed row Series is deprecated in pandas.
X_test = df_World_covid_19_data.iloc[:, cat_features].loc['US'].values

predict = model2.predict(X_test)

print('prediction',  predict)
print('actual:', df_World_covid_19_data.loc['US', 'covid_19_deaths_per_e5_capita'])

In [None]:
# Compare the prediction of the model fitted on all data with the actual value for Peru.
# Feature columns are selected with iloc on the DataFrame before the country row is picked;
# integer-position indexing on the label-indexed row Series is deprecated in pandas.
X_test = df_World_covid_19_data.iloc[:, cat_features].loc['Peru'].values

predict = model2.predict(X_test)

print('prediction',  predict)
print('actual:', df_World_covid_19_data.loc['Peru', 'covid_19_deaths_per_e5_capita'])

In [None]:
# Compare the prediction of the model fitted on all data with the actual value for Belgium.
# Feature columns are selected with iloc on the DataFrame before the country row is picked;
# integer-position indexing on the label-indexed row Series is deprecated in pandas.
X_test = df_World_covid_19_data.iloc[:, cat_features].loc['Belgium'].values

predict = model2.predict(X_test)

print('prediction',  predict)
print('actual:', df_World_covid_19_data.loc['Belgium', 'covid_19_deaths_per_e5_capita'])

# Training the model on all data except one country, then predicting that country and saving the result in a df

In [None]:
# make a backup copy
df_World_covid_19_data_backup = df_World_covid_19_data.copy()

In [None]:

df_result = pd.DataFrame(df_World_covid_19_data.covid_19_deaths_per_e5_capita)

In [None]:
df_result.index

In [None]:
# run Catboost model
# Leave-one-country-out evaluation: for every country, train on all other countries
# and predict the held-out one; predictions are collected in df_result['prediction'].
df_result = pd.DataFrame(df_World_covid_19_data.covid_19_deaths_per_e5_capita)
df_result['prediction'] = 0.0

# Feature matrix and target are built once; each iteration only drops the held-out row
# instead of copying the complete DataFrame.
features_all = df_World_covid_19_data.iloc[:, cat_features]
target_all = df_World_covid_19_data['covid_19_deaths_per_e5_capita']

for counter, country in enumerate(df_World_covid_19_data.index):
    X = features_all.drop(index=country)
    y = target_all.drop(index=country).values
    model = CatBoostRegressor(random_seed=20)
    model.fit(X, y, verbose=False, plot=False)
    # Row of the held-out country, taken from the positionally selected feature frame
    # (integer-position indexing on a label-indexed row Series is deprecated in pandas).
    X_test = features_all.loc[country].values
    predict = model.predict(X_test)
    # Write through df_result.loc[...]: the chained `df_result.prediction.at[...] = ...`
    # may assign to a temporary Series and leave df_result unchanged.
    df_result.loc[country, 'prediction'] = predict
    if counter % 10 == 0:
        print(counter, ': ', df_result.loc[country])

In [None]:
plt.figure(figsize=(20,10))
X = df_result.index.values
y1 = df_result.covid_19_deaths_per_e5_capita.values
y2 = df_result.prediction.values
plt.scatter(X,y1, label='covid_19_deaths_per_e5_capita')
plt.scatter(X,y2, label='prediction')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# Relative prediction error in percent: 100 * (prediction - actual) / (actual + 1).
# With the 1. added, the denominator is not 0 for a country whose actual value is 0.
df_result['pct_error'] = 100*(df_result.prediction \
                              - df_result.covid_19_deaths_per_e5_capita) \
                            /(df_result.covid_19_deaths_per_e5_capita + 1.)
                                                                                               
                                                                                                                                                                      
df_result = df_result.astype('float')                                                                                               
                                                                                               

In [None]:
df_result[abs(df_result.pct_error) < 5.]

# If some of you can help me with some advice, it would be much appreciated. I am not an expert and I know a lot of you are.