# Data Cleaning

## 1. Introduccion

## 2. Impotando Librerias

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

## 3. Creacion de `df` utilizando datos de `CoreCode` en `data_core/`

### 3.1. Preparacion dataset `confirmed_global.csv`

In [2]:
url_confirmed_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv"
df1 = pd.read_csv(url_confirmed_global)
#df1 = pd.read_csv('data_core/confirmed_global.csv')

In [3]:
# El analisis se va a hacer por pais, no por provincia de modo que elimino la columna 'Province/State'. Las columnas de 'Lat' y 'Long' 
# se van a eliminar ahora para luego mergearlas con el dataframe final, ya que las coordenadas se cerian alteradas en el 'groupby'.

df1 = df1.drop(['Province/State'], axis=1)
df1 = df1.drop(['Lat'], axis=1)
df1 = df1.drop(['Long'], axis=1)

In [4]:
# Una vez eliminada dichas columnas agrupamos los datos a nivel de fila por pais Sumando asi todos 
# los casos por pais que anteriormente estaban subdivididos por 'Province/State'.

# Comprobamos que efectivamente, hay nombres de paises que aparecen varias veces
print(df1["Country/Region"].value_counts().to_string())

China                               34
Canada                              16
United Kingdom                      12
France                              12
Australia                            8
Netherlands                          5
Denmark                              3
New Zealand                          2
Panama                               1
Niger                                1
Nigeria                              1
North Macedonia                      1
Norway                               1
Oman                                 1
Pakistan                             1
Palau                                1
Peru                                 1
Papua New Guinea                     1
Paraguay                             1
Philippines                          1
Poland                               1
Portugal                             1
Qatar                                1
Romania                              1
Russia                               1
Rwanda                   

In [5]:
df1.loc[df1["Country/Region"] == "Austria"]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,12/20/21,12/21/21,12/22/21,12/23/21,12/24/21,12/25/21,12/26/21,12/27/21,12/28/21,12/29/21
16,Austria,0,0,0,0,0,0,0,0,0,...,1251433,1253961,1256230,1258377,1260751,1262836,1264553,1266103,1268519,1271770


In [6]:
# Vemos que tras el groupby los casos de agrupado correctamente, ya que la suma de la columna de casos de un dia especifico
# es igual a la fila de ese mismo dia para df1 tras esta operacion
df1 = df1.groupby(['Country/Region']).sum().reset_index()
print(df1.loc[df1["Country/Region"] == "Austria"].sum())

Country/Region    Austria
1/22/20                 0
1/23/20                 0
1/24/20                 0
1/25/20                 0
                   ...   
12/25/21          1262836
12/26/21          1264553
12/27/21          1266103
12/28/21          1268519
12/29/21          1271770
Length: 709, dtype: object


In [7]:
# Vemos que solo existe un valor por pais. 
print(df1["Country/Region"].value_counts().to_string())

Afghanistan                         1
Namibia                             1
Netherlands                         1
New Zealand                         1
Nicaragua                           1
Niger                               1
Nigeria                             1
North Macedonia                     1
Norway                              1
Oman                                1
Pakistan                            1
Palau                               1
Panama                              1
Papua New Guinea                    1
Paraguay                            1
Peru                                1
Philippines                         1
Poland                              1
Portugal                            1
Qatar                               1
Romania                             1
Russia                              1
Rwanda                              1
Nepal                               1
Mozambique                          1
Albania                             1
Morocco     

<div align="center">
Confirmamos que el groupby se ha completado con exito
<div>

In [8]:
# Mergeamos las columnas de 'Date-Countrty' por cada pais y anadimos una columna con su valor correspondiente

# Agrupo las columnas de fecha en filas utilizando la funcion `melt` y hago un idetificador unico para mergear con el resto
# de tablas, que sera el (dia)+(el nombre del pais) para poder mergear correctamente con el resto de tablas por dia y pais
df1 = df1.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Confirmed")

# Creo la columna con el identificador para usarla como indentificador unico para el mergeo
df1['Date-Country'] = df1['Date'] + df1['Country/Region']

# Hago esta misma columna indice del dataframe
df1.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/29/21Vietnam,Vietnam,12/29/21,1694874
12/29/21West Bank and Gaza,West Bank and Gaza,12/29/21,469748
12/29/21Yemen,Yemen,12/29/21,10125
12/29/21Zambia,Zambia,12/29/21,243638


### 3.2. Preparacion dataset `deaths_global.csv`

Repetimos el mismo proceso anterior para el dataset `deaths_global.csv`

In [9]:
url_deaths_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv"
df2 = pd.read_csv(url_deaths_global)
#df2 = pd.read_csv('data_core/deaths_global.csv')

df2 = df2.drop(['Province/State'], axis=1)
df2 = df2.drop(['Lat'], axis=1)
df2 = df2.drop(['Long'], axis=1)
df2 = df2.groupby(['Country/Region']).sum().reset_index()
df2 = df2.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Deaths")
df2['Date-Country'] = df2['Date'] + df2['Country/Region']

df2.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/29/21Vietnam,Vietnam,12/29/21,31877
12/29/21West Bank and Gaza,West Bank and Gaza,12/29/21,4919
12/29/21Yemen,Yemen,12/29/21,1984
12/29/21Zambia,Zambia,12/29/21,3726


### 3.3. Preparacion dataset `recovered_global.csv`

Repetimos el mismo proceso anterior para el dataset `recovered_global.csv`

In [10]:
url_recovered_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_recovered_global.csv&filename=time_series_covid19_recovered_global.csv"
df3 = pd.read_csv(url_recovered_global)
#df3 = pd.read_csv('data_core/recovered_global.csv')


df3 = df3.drop(['Province/State'], axis=1)
df3 = df3.drop(['Lat'], axis=1)
df3 = df3.drop(['Long'], axis=1)
df3 = df3.groupby(['Country/Region']).sum().reset_index()
df3 = df3.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Recovered")
df3['Date-Country'] = df3['Date'] + df3['Country/Region']
df3.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Recovered
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/29/21Vietnam,Vietnam,12/29/21,0
12/29/21West Bank and Gaza,West Bank and Gaza,12/29/21,0
12/29/21Yemen,Yemen,12/29/21,0
12/29/21Zambia,Zambia,12/29/21,0


### 3.4. Juntamos todos los dataframe `df1`, `df2` y `df3` en uno solo `df`

In [11]:
# Creo un primer dataframe final (df_f1), mergeando df1 y df2 por 'Date-Country'
df = pd.merge(df1, df2 , how='left', on='Date-Country')

# Creo un sefundo dataframe final, mergeando el anterior dataframe (df_f1) y df3 por 'Date-Country'
df = pd.merge(df, df3 , how='left', on='Date-Country')


In [12]:
#Elimino las columnas duplicadas
df = df.drop(['Date-Country','Country/Region_y','Date_y', 'Country/Region_x','Date_x'], axis=1)

# Reordeno las Columnas
df = df[['Country/Region','Date','Confirmed','Deaths','Recovered']]
df = df.rename(columns={'Country/Region':'Country'})
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
0,Afghanistan,1/22/20,0,0,0
1,Albania,1/22/20,0,0,0
2,Algeria,1/22/20,0,0,0
3,Andorra,1/22/20,0,0,0
4,Angola,1/22/20,0,0,0
...,...,...,...,...,...
138763,Vietnam,12/29/21,1694874,31877,0
138764,West Bank and Gaza,12/29/21,469748,4919,0
138765,Yemen,12/29/21,10125,1984,0
138766,Zambia,12/29/21,243638,3726,0


## 4. Anado datos geograficos y poblacion a `df`

### 4.1. Anado las columnas de `'Lat'` y `'Long'` al dataframe `df`

In [13]:
df4 = pd.read_csv("data_extra/concap.csv")
df4 = df4.drop(['CapitalName'], axis=1)
df4 = df4.drop_duplicates()

filter_continent = df4['ContinentName'] == 'Europe'
df4 = df4[filter_continent]

df4 = df4.rename(columns={'CountryName':'Country',
                          'CapitalLatitude':'Lat', 
                          'CapitalLongitude':'Long', 
                          'CountryCode':'geoId',
                          'ContinentName':'continentExp'})
df4.head(3)


Unnamed: 0,Country,Lat,Long,geoId,continentExp
4,Aland Islands,60.116667,19.9,AX,Europe
10,Albania,41.316667,19.816667,AL,Europe
13,Andorra,42.5,1.516667,AD,Europe


In [14]:
df = pd.merge(df, df4 , how='left', on='Country')

### 4.2. Filtrado de `df` por `'continentExP'`: `'Europe'`

Para poder aprovechar los dataset de data_extra, que estan centrados unicamente en Europa, y ademas poder centrar mejor el analisis, voy a filtrar el dataframe eliminando todos los paises que no son europeos.

In [15]:
# Filtro el dataframe (df) para paises europeos 
filter_europe = df['continentExp'] == 'Europe'
df = df[filter_europe]
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,continentExp
1,Albania,1/22/20,0,0,0,41.316667,19.816667,AL,Europe
3,Andorra,1/22/20,0,0,0,42.500000,1.516667,AD,Europe
7,Armenia,1/22/20,0,0,0,40.166667,44.500000,AM,Europe
9,Austria,1/22/20,0,0,0,48.200000,16.366667,AT,Europe
10,Azerbaijan,1/22/20,0,0,0,40.383333,49.866667,AZ,Europe
...,...,...,...,...,...,...,...,...,...
138741,Sweden,12/29/21,1303663,15297,0,59.333333,18.050000,SE,Europe
138742,Switzerland,12/29/21,1294592,12172,0,46.916667,7.466667,CH,Europe
138753,Turkey,12/29/21,9367369,81917,0,39.933333,32.866667,TR,Europe
138756,Ukraine,12/29/21,3833952,101548,0,50.433333,30.516667,UA,Europe


## 5. Modificacion del indice de `df` y creacion de columnas `'Year'`, `'Week'` y `'Day'`

### 5.1. Cambio de tipo de datos

In [16]:
df.dtypes

Country          object
Date             object
Confirmed         int64
Deaths            int64
Recovered         int64
Lat             float64
Long            float64
geoId            object
continentExp     object
dtype: object

In [17]:
# Cabia Date a tipo datetime
df['Date'] = pd.to_datetime(df.Date)

### 5.2. Extraccion de nuevas columnas a traves de `'Date'`

In [18]:
# Declaro variables
y = df['Date'].dt
x = df['Date'].dt.isocalendar().week.apply(np.int64)

# Creo nuevas columnas con tipo int64
df['Year'] = y.year
df['Month'] = y.month
df['Week'] = x
df['Week-Copy'] = x
df['Day'] = y.day

# comprobamos que efectivamente las columnas se han creado como int64
df.dtypes

Country                 object
Date            datetime64[ns]
Confirmed                int64
Deaths                   int64
Recovered                int64
Lat                    float64
Long                   float64
geoId                   object
continentExp            object
Year                     int64
Month                    int64
Week                     int64
Week-Copy                int64
Day                      int64
dtype: object

In [19]:
# Para poder mergear con los data set que tienen informacion a nivel 'year'-'week'
# Necesito poner un '0' delante de las semanas que sean menores de 10
# Esta columna al tener el '-', obligatoriamente sera de tipo 'object.

def str_fixer(value):
    if int(value) < 10:
        return f'0{value}'
    else:
        return str(value)

df["Week-Copy"] = df["Week-Copy"].apply(str_fixer)


In [20]:
# Comprobamos que funciona correctamente
df["Week-Copy"].unique()

array(['04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14',
       '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25',
       '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47',
       '48', '49', '50', '51', '52', '53', '01', '02', '03'], dtype=object)

In [21]:
# Creo los id que me serviran para mergear el resto de datasets
df["Year-Week"] = df["Year"].apply(str) + "-" + df["Week-Copy"]
df["Year-Week-Copy"] = df["Year"].apply(str) + "-W" + df["Week-Copy"]

In [22]:
df.head(5)

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,continentExp,Year,Month,Week,Week-Copy,Day,Year-Week,Year-Week-Copy
1,Albania,2020-01-22,0,0,0,41.316667,19.816667,AL,Europe,2020,1,4,4,22,2020-04,2020-W04
3,Andorra,2020-01-22,0,0,0,42.5,1.516667,AD,Europe,2020,1,4,4,22,2020-04,2020-W04
7,Armenia,2020-01-22,0,0,0,40.166667,44.5,AM,Europe,2020,1,4,4,22,2020-04,2020-W04
9,Austria,2020-01-22,0,0,0,48.2,16.366667,AT,Europe,2020,1,4,4,22,2020-04,2020-W04
10,Azerbaijan,2020-01-22,0,0,0,40.383333,49.866667,AZ,Europe,2020,1,4,4,22,2020-04,2020-W04


In [23]:
df.dtypes

Country                   object
Date              datetime64[ns]
Confirmed                  int64
Deaths                     int64
Recovered                  int64
Lat                      float64
Long                     float64
geoId                     object
continentExp              object
Year                       int64
Month                      int64
Week                       int64
Week-Copy                 object
Day                        int64
Year-Week                 object
Year-Week-Copy            object
dtype: object

## 6. Union de los dataset: `ICU_hospital.csv`, `test_rate.csv` y `vaccine_tracker.csv` con `df`

### 6.1. Creacion de `id` en `df` para el mergeo con los dataset de `data_extra`

In [24]:
#Borro Columnas sobrantes
df = df.drop(['continentExp'], axis=1)

# Reordeno las Columnas
#df = df[['Date','Country','geoId','Lat','Long','Year','Month','Week','Day','Confirmed','Deaths',
#         'Recovered','Week-Copy','Year-Week','Year-Week-Copy']]

# Defino las columnas que me serviran para mergear con otros dataset

# id para dataset de vaccine tracker
df['id-merge-vaccine'] = df['geoId'] + df['Year-Week-Copy']

# id para dataset de testrate
df['id-merge-test'] = df['Country'] + df['Year-Week-Copy']

# id Prara dataset de ICU
df['id-merge-icu'] = df['Country'] + df['Date'].apply(str)
df['id-merge-icu'] = df['id-merge-icu'].apply(lambda x: x.split(' ')[0])


### 6.2. Preparacion dataset `ICU_hospital.csv`

In [25]:
url_UCI = "https://opendata.ecdc.europa.eu/covid19/hospitalicuadmissionrates/csv/data.csv"
df_ex3 = pd.read_csv(url_UCI)
#df_ex3 = pd.read_csv('data_extra/ICU_hospital.csv')

In [26]:
df_ex3

Unnamed: 0,country,indicator,date,year_week,value,source,url
0,Austria,Daily hospital occupancy,2020-04-01,2020-W14,856.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
1,Austria,Daily hospital occupancy,2020-04-02,2020-W14,823.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
2,Austria,Daily hospital occupancy,2020-04-03,2020-W14,829.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
3,Austria,Daily hospital occupancy,2020-04-04,2020-W14,826.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
4,Austria,Daily hospital occupancy,2020-04-05,2020-W14,712.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
...,...,...,...,...,...,...,...
30930,Sweden,Weekly new ICU admissions per 100k,2021-11-28,2021-W47,0.338898,"TESSy COVID-19, national daily data",
30931,Sweden,Weekly new ICU admissions per 100k,2021-12-05,2021-W48,0.309850,"TESSy COVID-19, national daily data",
30932,Sweden,Weekly new ICU admissions per 100k,2021-12-12,2021-W49,0.445409,"TESSy COVID-19, national daily data",
30933,Sweden,Weekly new ICU admissions per 100k,2021-12-19,2021-W50,0.600334,"TESSy COVID-19, national daily data",


In [27]:
print((df_ex3.isnull().sum()/len(df_ex3))*100)

country       0.000000
indicator     0.000000
date          0.000000
year_week     0.000000
value         0.000000
source        0.000000
url          10.117989
dtype: float64


In [28]:
df_ex3['indicator'].unique()

array(['Daily hospital occupancy', 'Daily ICU occupancy',
       'Weekly new hospital admissions per 100k',
       'Weekly new ICU admissions per 100k'], dtype=object)

In [29]:
# Divido el dataset en 4 dataset por indicator y luego los uno en uno solo por columnas
df_ex3['id-merge-icu'] = df_ex3['country']+df_ex3['date']
df_ex3['id-merge-icu']

0        Austria2020-04-01
1        Austria2020-04-02
2        Austria2020-04-03
3        Austria2020-04-04
4        Austria2020-04-05
               ...        
30930     Sweden2021-11-28
30931     Sweden2021-12-05
30932     Sweden2021-12-12
30933     Sweden2021-12-19
30934     Sweden2021-12-26
Name: id-merge-icu, Length: 30935, dtype: object

In [30]:
# Borro columnas sobrantes
df_ex3 = df_ex3.drop(['year_week', 'source', 'url'], axis=1)

In [31]:
# Hospital_Occupancy

filter_uci = df_ex3['indicator'] == 'Daily hospital occupancy'
df_uci_1 = df_ex3[filter_uci]

# DEFINO EL DATAFRAME
df_uci_1 = df_uci_1.rename(columns={'value':'Hospital_Occupancy'})
df_uci_1 = df_uci_1.drop(['indicator','date','country'], axis=1)

df_uci_1

Unnamed: 0,Hospital_Occupancy,id-merge-icu
0,856.0,Austria2020-04-01
1,823.0,Austria2020-04-02
2,829.0,Austria2020-04-03
3,826.0,Austria2020-04-04
4,712.0,Austria2020-04-05
...,...,...
30171,526.0,Sweden2021-12-16
30172,524.0,Sweden2021-12-17
30173,513.0,Sweden2021-12-18
30174,515.0,Sweden2021-12-19


In [32]:
# ICU occupancy

filter_uci = df_ex3['indicator'] == 'Daily ICU occupancy'
df_uci_2 = df_ex3[filter_uci]

#DEFINO EL DATAFRAME
df_uci_2 = df_uci_2.rename(columns={'value':'ICU_Occupancy'})
df_uci_2 = df_uci_2.drop(['indicator','date','country'], axis=1)

df_uci_2

Unnamed: 0,ICU_Occupancy,id-merge-icu
629,215.0,Austria2020-04-01
630,219.0,Austria2020-04-02
631,245.0,Austria2020-04-03
632,245.0,Austria2020-04-04
633,244.0,Austria2020-04-05
...,...,...
30835,68.0,Sweden2021-12-17
30836,72.0,Sweden2021-12-18
30837,72.0,Sweden2021-12-19
30838,76.0,Sweden2021-12-20


In [33]:
df_ex3 = df_ex3.drop(['indicator','value','country'], axis=1)
df_ex3

Unnamed: 0,date,id-merge-icu
0,2020-04-01,Austria2020-04-01
1,2020-04-02,Austria2020-04-02
2,2020-04-03,Austria2020-04-03
3,2020-04-04,Austria2020-04-04
4,2020-04-05,Austria2020-04-05
...,...,...
30930,2021-11-28,Sweden2021-11-28
30931,2021-12-05,Sweden2021-12-05
30932,2021-12-12,Sweden2021-12-12
30933,2021-12-19,Sweden2021-12-19


In [34]:
df_ex3 = pd.merge(df_ex3, df_uci_1 , how='left', on='id-merge-icu')
df_ex3

Unnamed: 0,date,id-merge-icu,Hospital_Occupancy
0,2020-04-01,Austria2020-04-01,856.0
1,2020-04-02,Austria2020-04-02,823.0
2,2020-04-03,Austria2020-04-03,829.0
3,2020-04-04,Austria2020-04-04,826.0
4,2020-04-05,Austria2020-04-05,712.0
...,...,...,...
30930,2021-11-28,Sweden2021-11-28,293.0
30931,2021-12-05,Sweden2021-12-05,344.0
30932,2021-12-12,Sweden2021-12-12,451.0
30933,2021-12-19,Sweden2021-12-19,515.0


In [35]:
df_ex3 = pd.merge(df_ex3, df_uci_2 , how='left', on='id-merge-icu')
df_ex3

Unnamed: 0,date,id-merge-icu,Hospital_Occupancy,ICU_Occupancy
0,2020-04-01,Austria2020-04-01,856.0,215.0
1,2020-04-02,Austria2020-04-02,823.0,219.0
2,2020-04-03,Austria2020-04-03,829.0,245.0
3,2020-04-04,Austria2020-04-04,826.0,245.0
4,2020-04-05,Austria2020-04-05,712.0,244.0
...,...,...,...,...
30930,2021-11-28,Sweden2021-11-28,293.0,29.0
30931,2021-12-05,Sweden2021-12-05,344.0,46.0
30932,2021-12-12,Sweden2021-12-12,451.0,53.0
30933,2021-12-19,Sweden2021-12-19,515.0,72.0


In [36]:
df_ex3 = df_ex3.drop(columns=['date'])
df_ex3

Unnamed: 0,id-merge-icu,Hospital_Occupancy,ICU_Occupancy
0,Austria2020-04-01,856.0,215.0
1,Austria2020-04-02,823.0,219.0
2,Austria2020-04-03,829.0,245.0
3,Austria2020-04-04,826.0,245.0
4,Austria2020-04-05,712.0,244.0
...,...,...,...
30930,Sweden2021-11-28,293.0,29.0
30931,Sweden2021-12-05,344.0,46.0
30932,Sweden2021-12-12,451.0,53.0
30933,Sweden2021-12-19,515.0,72.0


### 6.3. Preapracion dataset `test_rate.csv`

In [37]:
url_test_rate = "https://opendata.ecdc.europa.eu/covid19/testing/csv/data.csv"
df_ex1 = pd.read_csv(url_test_rate)
#df_ex1 = pd.read_csv('data_extra/test_rate.csv')

In [38]:
df_ex1

Unnamed: 0,country,country_code,year_week,level,region,region_name,new_cases,tests_done,population,testing_rate,positivity_rate,testing_data_source
0,Austria,AT,2020-W15,national,AT,Austria,1838.0,12339,8901064.0,138.623877,14.895859,Manual webscraping
1,Austria,AT,2020-W16,national,AT,Austria,684.0,58488,8901064.0,657.089984,1.169471,Manual webscraping
2,Austria,AT,2020-W17,national,AT,Austria,448.0,33443,8901064.0,375.719128,1.339593,Manual webscraping
3,Austria,AT,2020-W18,national,AT,Austria,312.0,26598,8901064.0,298.818209,1.173021,Country website
4,Austria,AT,2020-W19,national,AT,Austria,264.0,42153,8901064.0,473.572598,0.626290,Country website
...,...,...,...,...,...,...,...,...,...,...,...,...
11878,Sweden,SE,2021-W47,national,SE,Sweden,11916.0,226289,10327589.0,2191.111594,5.265833,TESSy
11879,Sweden,SE,2021-W48,national,SE,Sweden,13800.0,273987,10327589.0,2652.961887,5.036735,TESSy
11880,Sweden,SE,2021-W49,national,SE,Sweden,18654.0,335956,10327589.0,3252.995447,5.552513,TESSy
11881,Sweden,SE,2021-W50,national,SE,Sweden,24542.0,386528,10327589.0,3742.674113,6.349346,TESSy


In [39]:
df_ex1['level'].unique()

array(['national', 'subnational'], dtype=object)

In [40]:
# Dentro del dataset exiten datos a nivel nacional y subnacional. Vamos a filtrar por nacional, para descartar 
# todas las lineas por provicia, ya que nuestro analisis es a nivel nacional en Europa.

filter_national = df_ex1['level'] == 'national'
df_ex1 = df_ex1[filter_national]

In [41]:
# Eliminamos columnas no necesarias 

df_ex1 = df_ex1.drop(['region_name', 'new_cases', 'testing_data_source','region_name','level','region'], axis=1)

In [42]:
df_ex1['id-merge-test'] = df_ex1['country'] + df_ex1['year_week']

df_ex1 = df_ex1.drop(['year_week', 'country_code','country'], axis=1)

df_ex1['population'] = df_ex1['population'].astype(int)


df_ex1

Unnamed: 0,tests_done,population,testing_rate,positivity_rate,id-merge-test
0,12339,8901064,138.623877,14.895859,Austria2020-W15
1,58488,8901064,657.089984,1.169471,Austria2020-W16
2,33443,8901064,375.719128,1.339593,Austria2020-W17
3,26598,8901064,298.818209,1.173021,Austria2020-W18
4,42153,8901064,473.572598,0.626290,Austria2020-W19
...,...,...,...,...,...
11878,226289,10327589,2191.111594,5.265833,Sweden2021-W47
11879,273987,10327589,2652.961887,5.036735,Sweden2021-W48
11880,335956,10327589,3252.995447,5.552513,Sweden2021-W49
11881,386528,10327589,3742.674113,6.349346,Sweden2021-W50


### 6.4. Preparacion dataset `vaccine_tracker.csv`

In [43]:
url_vaccine_tracker = "https://opendata.ecdc.europa.eu/covid19/vaccine_tracker/csv/data.csv"
df_ex2 = pd.read_csv(url_vaccine_tracker)
#df_ex2 = pd.read_csv('data_extra/vaccine_tracker.csv')
df_ex2


Unnamed: 0,YearWeekISO,ReportingCountry,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose,DoseAdditional1,UnknownDose,Region,TargetGroup,Vaccine,Population
0,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,MOD,8901064
1,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,JANSS,8901064
2,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,UNK,8901064
3,2020-W53,AT,8901064.0,61425.0,0.0,5243,,0,0,0,AT,ALL,COM,8901064
4,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,AZ,8901064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197511,2021-W51,SK,391090.0,0.0,0.0,23,,8,292,0,SK,Age70_79,MOD,5457873
197512,2021-W51,SK,391090.0,0.0,0.0,163,,201,1457,0,SK,Age70_79,COM,5457873
197513,2021-W51,SK,184680.0,0.0,0.0,1,,0,0,0,SK,Age80+,JANSS,5457873
197514,2021-W51,SK,184680.0,0.0,0.0,70,,76,451,0,SK,Age80+,COM,5457873


In [44]:
df_ex2['TargetGroup'].unique()

array(['ALL', 'Age0_4', 'Age10_14', 'Age15_17', 'Age18_24', 'Age25_49',
       'Age50_59', 'Age5_9', 'Age60_69', 'Age70_79', 'Age80+', 'Age<18',
       'AgeUNK', 'HCW', 'LTCF', '1_Age60+', '1_Age<60'], dtype=object)

In [45]:
'''
filter_target = df_ex2['TargetGroup'] != '1_Age<60'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != '1_Age60+'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'LTCF'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'HCW'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'AgeUNK'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'Age<18'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'ALL'
df_ex2 = df_ex2[filter_target]
'''
filter_target = df_ex2['TargetGroup'] == 'ALL'
df_ex2 = df_ex2[filter_target]

df_ex2['TargetGroup'].unique()

array(['ALL'], dtype=object)

In [46]:
# Borro columnas sobrantes para el merge
df_ex2 = df_ex2.drop(['Denominator', 'DoseAdditional1', 'UnknownDose','Population','Vaccine','TargetGroup'], axis=1)

# Defino el id del merge
df_ex2['id-merge-vaccine'] = df_ex2['ReportingCountry'] + df_ex2['YearWeekISO']

# Borro columnas sobrantes usadas para crear el id
df_ex2 = df_ex2.drop(['YearWeekISO', 'ReportingCountry', 'Region'], axis=1)


df_ex2 = df_ex2.groupby(['id-merge-vaccine']).sum().reset_index()

df_ex2

Unnamed: 0,id-merge-vaccine,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose
0,AT2020-W53,61425.0,0.0,5243,0.0,0
1,AT2021-W01,61425.0,0.0,26181,0.0,0
2,AT2021-W02,68625.0,0.0,84934,0.0,398
3,AT2021-W03,58500.0,0.0,93267,0.0,4568
4,AT2021-W04,54990.0,0.0,31517,0.0,17516
...,...,...,...,...,...,...
1548,SK2021-W47,0.0,0.0,49023,0.0,6738
1549,SK2021-W48,0.0,0.0,26133,0.0,7134
1550,SK2021-W49,0.0,0.0,20317,0.0,14250
1551,SK2021-W50,0.0,0.0,16742,0.0,21223


### 6.5. Mergeo: Enriquecimiento de `df` con datadet: `test_rate.csv`, `vaccine_tracker.csv` y `ICU_hospital.csv`

In [47]:
df.columns

Index(['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long',
       'geoId', 'Year', 'Month', 'Week', 'Week-Copy', 'Day', 'Year-Week',
       'Year-Week-Copy', 'id-merge-vaccine', 'id-merge-test', 'id-merge-icu'],
      dtype='object')

In [48]:
# Mergeo con el dataset de test
df = pd.merge(df, df_ex1 , how='left', on='id-merge-test')
print((df.isnull().sum()/len(df))*100)
df

Country              0.000000
Date                 0.000000
Confirmed            0.000000
Deaths               0.000000
Recovered            0.000000
Lat                  0.000000
Long                 0.000000
geoId                0.000000
Year                 0.000000
Month                0.000000
Week                 0.000000
Week-Copy            0.000000
Day                  0.000000
Year-Week            0.000000
Year-Week-Copy       0.000000
id-merge-vaccine     0.000000
id-merge-test        0.000000
id-merge-icu         0.000000
tests_done          43.244380
population          43.244380
testing_rate        43.244380
positivity_rate     43.517851
dtype: float64


Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,Year,Month,...,Day,Year-Week,Year-Week-Copy,id-merge-vaccine,id-merge-test,id-merge-icu,tests_done,population,testing_rate,positivity_rate
0,Albania,2020-01-22,0,0,0,41.316667,19.816667,AL,2020,1,...,22,2020-04,2020-W04,AL2020-W04,Albania2020-W04,Albania2020-01-22,,,,
1,Andorra,2020-01-22,0,0,0,42.500000,1.516667,AD,2020,1,...,22,2020-04,2020-W04,AD2020-W04,Andorra2020-W04,Andorra2020-01-22,,,,
2,Armenia,2020-01-22,0,0,0,40.166667,44.500000,AM,2020,1,...,22,2020-04,2020-W04,AM2020-W04,Armenia2020-W04,Armenia2020-01-22,,,,
3,Austria,2020-01-22,0,0,0,48.200000,16.366667,AT,2020,1,...,22,2020-04,2020-W04,AT2020-W04,Austria2020-W04,Austria2020-01-22,,,,
4,Azerbaijan,2020-01-22,0,0,0,40.383333,49.866667,AZ,2020,1,...,22,2020-04,2020-W04,AZ2020-W04,Azerbaijan2020-W04,Azerbaijan2020-01-22,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33271,Sweden,2021-12-29,1303663,15297,0,59.333333,18.050000,SE,2021,12,...,29,2021-52,2021-W52,SE2021-W52,Sweden2021-W52,Sweden2021-12-29,,,,
33272,Switzerland,2021-12-29,1294592,12172,0,46.916667,7.466667,CH,2021,12,...,29,2021-52,2021-W52,CH2021-W52,Switzerland2021-W52,Switzerland2021-12-29,,,,
33273,Turkey,2021-12-29,9367369,81917,0,39.933333,32.866667,TR,2021,12,...,29,2021-52,2021-W52,TR2021-W52,Turkey2021-W52,Turkey2021-12-29,,,,
33274,Ukraine,2021-12-29,3833952,101548,0,50.433333,30.516667,UA,2021,12,...,29,2021-52,2021-W52,UA2021-W52,Ukraine2021-W52,Ukraine2021-12-29,,,,


In [49]:
df = pd.merge(df, df_ex2 , how='left', on='id-merge-vaccine')
print((df.isnull().sum()/len(df))*100)
df

Country                 0.000000
Date                    0.000000
Confirmed               0.000000
Deaths                  0.000000
Recovered               0.000000
Lat                     0.000000
Long                    0.000000
geoId                   0.000000
Year                    0.000000
Month                   0.000000
Week                    0.000000
Week-Copy               0.000000
Day                     0.000000
Year-Week               0.000000
Year-Week-Copy          0.000000
id-merge-vaccine        0.000000
id-merge-test           0.000000
id-merge-icu            0.000000
tests_done             43.244380
population             43.244380
testing_rate           43.244380
positivity_rate        43.517851
NumberDosesReceived    69.743960
NumberDosesExported    69.743960
FirstDose              69.743960
FirstDoseRefused       69.743960
SecondDose             69.743960
dtype: float64


Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,Year,Month,...,id-merge-icu,tests_done,population,testing_rate,positivity_rate,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose
0,Albania,2020-01-22,0,0,0,41.316667,19.816667,AL,2020,1,...,Albania2020-01-22,,,,,,,,,
1,Andorra,2020-01-22,0,0,0,42.500000,1.516667,AD,2020,1,...,Andorra2020-01-22,,,,,,,,,
2,Armenia,2020-01-22,0,0,0,40.166667,44.500000,AM,2020,1,...,Armenia2020-01-22,,,,,,,,,
3,Austria,2020-01-22,0,0,0,48.200000,16.366667,AT,2020,1,...,Austria2020-01-22,,,,,,,,,
4,Azerbaijan,2020-01-22,0,0,0,40.383333,49.866667,AZ,2020,1,...,Azerbaijan2020-01-22,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33271,Sweden,2021-12-29,1303663,15297,0,59.333333,18.050000,SE,2021,12,...,Sweden2021-12-29,,,,,,,,,
33272,Switzerland,2021-12-29,1294592,12172,0,46.916667,7.466667,CH,2021,12,...,Switzerland2021-12-29,,,,,,,,,
33273,Turkey,2021-12-29,9367369,81917,0,39.933333,32.866667,TR,2021,12,...,Turkey2021-12-29,,,,,,,,,
33274,Ukraine,2021-12-29,3833952,101548,0,50.433333,30.516667,UA,2021,12,...,Ukraine2021-12-29,,,,,,,,,


In [50]:
df = pd.merge(df, df_ex3 , how='left', on='id-merge-icu')
print((df.isnull().sum()/len(df))*100)
df

Country                 0.000000
Date                    0.000000
Confirmed               0.000000
Deaths                  0.000000
Recovered               0.000000
Lat                     0.000000
Long                    0.000000
geoId                   0.000000
Year                    0.000000
Month                   0.000000
Week                    0.000000
Week-Copy               0.000000
Day                     0.000000
Year-Week               0.000000
Year-Week-Copy          0.000000
id-merge-vaccine        0.000000
id-merge-test           0.000000
id-merge-icu            0.000000
tests_done             32.122802
population             32.122802
testing_rate           32.122802
positivity_rate        32.339614
NumberDosesReceived    62.233593
NumberDosesExported    62.233593
FirstDose              62.233593
FirstDoseRefused       62.233593
SecondDose             62.233593
Hospital_Occupancy     42.135160
ICU_Occupancy          47.245409
dtype: float64


Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,Year,Month,...,population,testing_rate,positivity_rate,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose,Hospital_Occupancy,ICU_Occupancy
0,Albania,2020-01-22,0,0,0,41.316667,19.816667,AL,2020,1,...,,,,,,,,,,
1,Andorra,2020-01-22,0,0,0,42.500000,1.516667,AD,2020,1,...,,,,,,,,,,
2,Armenia,2020-01-22,0,0,0,40.166667,44.500000,AM,2020,1,...,,,,,,,,,,
3,Austria,2020-01-22,0,0,0,48.200000,16.366667,AT,2020,1,...,,,,,,,,,,
4,Azerbaijan,2020-01-22,0,0,0,40.383333,49.866667,AZ,2020,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46118,Sweden,2021-12-29,1303663,15297,0,59.333333,18.050000,SE,2021,12,...,,,,,,,,,,
46119,Switzerland,2021-12-29,1294592,12172,0,46.916667,7.466667,CH,2021,12,...,,,,,,,,,,
46120,Turkey,2021-12-29,9367369,81917,0,39.933333,32.866667,TR,2021,12,...,,,,,,,,,,
46121,Ukraine,2021-12-29,3833952,101548,0,50.433333,30.516667,UA,2021,12,...,,,,,,,,,,


In [51]:
df.columns

Index(['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long',
       'geoId', 'Year', 'Month', 'Week', 'Week-Copy', 'Day', 'Year-Week',
       'Year-Week-Copy', 'id-merge-vaccine', 'id-merge-test', 'id-merge-icu',
       'tests_done', 'population', 'testing_rate', 'positivity_rate',
       'NumberDosesReceived', 'NumberDosesExported', 'FirstDose',
       'FirstDoseRefused', 'SecondDose', 'Hospital_Occupancy',
       'ICU_Occupancy'],
      dtype='object')

In [52]:
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,Year,Month,...,population,testing_rate,positivity_rate,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose,Hospital_Occupancy,ICU_Occupancy
0,Albania,2020-01-22,0,0,0,41.316667,19.816667,AL,2020,1,...,,,,,,,,,,
1,Andorra,2020-01-22,0,0,0,42.500000,1.516667,AD,2020,1,...,,,,,,,,,,
2,Armenia,2020-01-22,0,0,0,40.166667,44.500000,AM,2020,1,...,,,,,,,,,,
3,Austria,2020-01-22,0,0,0,48.200000,16.366667,AT,2020,1,...,,,,,,,,,,
4,Azerbaijan,2020-01-22,0,0,0,40.383333,49.866667,AZ,2020,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46118,Sweden,2021-12-29,1303663,15297,0,59.333333,18.050000,SE,2021,12,...,,,,,,,,,,
46119,Switzerland,2021-12-29,1294592,12172,0,46.916667,7.466667,CH,2021,12,...,,,,,,,,,,
46120,Turkey,2021-12-29,9367369,81917,0,39.933333,32.866667,TR,2021,12,...,,,,,,,,,,
46121,Ukraine,2021-12-29,3833952,101548,0,50.433333,30.516667,UA,2021,12,...,,,,,,,,,,


## 7. Dataframe final `df`

### 7.1. Limpieza de columnas y sort de `df`

In [53]:
df = df.drop(['id-merge-test', 'Year-Week-Copy', 'Week-Copy', 'geoId','id-merge-vaccine','id-merge-icu','SecondDose'], axis=1)
df = df.rename(columns={'NumberDosesReceived':'DosesReceived',
                        'NumberDosesExported':'DosesExported'})

df.columns

Index(['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long',
       'Year', 'Month', 'Week', 'Day', 'Year-Week', 'tests_done', 'population',
       'testing_rate', 'positivity_rate', 'DosesReceived', 'DosesExported',
       'FirstDose', 'FirstDoseRefused', 'Hospital_Occupancy', 'ICU_Occupancy'],
      dtype='object')

In [54]:
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,Year,Month,Week,...,tests_done,population,testing_rate,positivity_rate,DosesReceived,DosesExported,FirstDose,FirstDoseRefused,Hospital_Occupancy,ICU_Occupancy
0,Albania,2020-01-22,0,0,0,41.316667,19.816667,2020,1,4,...,,,,,,,,,,
1,Andorra,2020-01-22,0,0,0,42.500000,1.516667,2020,1,4,...,,,,,,,,,,
2,Armenia,2020-01-22,0,0,0,40.166667,44.500000,2020,1,4,...,,,,,,,,,,
3,Austria,2020-01-22,0,0,0,48.200000,16.366667,2020,1,4,...,,,,,,,,,,
4,Azerbaijan,2020-01-22,0,0,0,40.383333,49.866667,2020,1,4,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46118,Sweden,2021-12-29,1303663,15297,0,59.333333,18.050000,2021,12,52,...,,,,,,,,,,
46119,Switzerland,2021-12-29,1294592,12172,0,46.916667,7.466667,2021,12,52,...,,,,,,,,,,
46120,Turkey,2021-12-29,9367369,81917,0,39.933333,32.866667,2021,12,52,...,,,,,,,,,,
46121,Ukraine,2021-12-29,3833952,101548,0,50.433333,30.516667,2021,12,52,...,,,,,,,,,,


### 7.3. Rellenando valores nulos

In [55]:
# con este filtro voy a eliminar varios paises que no aparecen directamente en mis datos enriquecidos 
filter_Confirmed_0 = df['population'] > 0
df = df[filter_Confirmed_0]
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,Year,Month,Week,...,tests_done,population,testing_rate,positivity_rate,DosesReceived,DosesExported,FirstDose,FirstDoseRefused,Hospital_Occupancy,ICU_Occupancy
13,Finland,2020-01-22,0,0,0,60.166667,24.933333,2020,1,4,...,21.0,5525292.0,0.380070,0.000000,,,,,,
42,Sweden,2020-01-22,0,0,0,59.333333,18.050000,2020,1,4,...,11.0,10327589.0,0.106511,0.000000,,,,,,
60,Finland,2020-01-23,0,0,0,60.166667,24.933333,2020,1,4,...,21.0,5525292.0,0.380070,0.000000,,,,,,
89,Sweden,2020-01-23,0,0,0,59.333333,18.050000,2020,1,4,...,11.0,10327589.0,0.106511,0.000000,,,,,,
107,Finland,2020-01-24,0,0,0,60.166667,24.933333,2020,1,4,...,21.0,5525292.0,0.380070,0.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45973,Slovakia,2021-12-26,1349806,16398,0,48.150000,17.116667,2021,12,51,...,230975.0,5457873.0,4231.959959,8.625176,0.0,0.0,1308.0,0.0,,
45974,Slovenia,2021-12-26,456057,5543,0,46.050000,14.516667,2021,12,51,...,531191.0,2095861.0,25344.762844,1.429994,,,,,,
45975,Spain,2021-12-26,5718007,89019,0,40.400000,-3.683333,2021,12,51,...,2288693.0,47332614.0,4835.340385,15.356057,,,,,,
45976,Spain,2021-12-26,5718007,89019,0,40.400000,-3.683333,2021,12,51,...,2288693.0,47332614.0,4835.340385,15.356057,,,,,,


In [56]:
print((df.isnull().sum()/len(df))*100)
print(df.shape)

Country                0.000000
Date                   0.000000
Confirmed              0.000000
Deaths                 0.000000
Recovered              0.000000
Lat                    0.000000
Long                   0.000000
Year                   0.000000
Month                  0.000000
Week                   0.000000
Day                    0.000000
Year-Week              0.000000
tests_done             0.000000
population             0.000000
testing_rate           0.000000
positivity_rate        0.319417
DosesReceived         44.386239
DosesExported         44.386239
FirstDose             44.386239
FirstDoseRefused      44.386239
Hospital_Occupancy    17.328393
ICU_Occupancy         24.473760
dtype: float64
(31307, 22)


In [57]:

df['positivity_rate'] = df['positivity_rate'].fillna(0)

df['FirstDose'] = df['FirstDose'].fillna(0)
df['DosesReceived'] = df['DosesReceived'].fillna(0)
df['DosesExported'] = df['DosesExported'].fillna(0)
df['FirstDoseRefused'] = df['FirstDoseRefused'].fillna(0)

df['Hospital_Occupancy'] = df['Hospital_Occupancy'].fillna(0)
df['ICU_Occupancy'] = df['ICU_Occupancy'].fillna(0)

print((df.isnull().sum()/len(df))*100)
print(df.shape)



Country               0.0
Date                  0.0
Confirmed             0.0
Deaths                0.0
Recovered             0.0
Lat                   0.0
Long                  0.0
Year                  0.0
Month                 0.0
Week                  0.0
Day                   0.0
Year-Week             0.0
tests_done            0.0
population            0.0
testing_rate          0.0
positivity_rate       0.0
DosesReceived         0.0
DosesExported         0.0
FirstDose             0.0
FirstDoseRefused      0.0
Hospital_Occupancy    0.0
ICU_Occupancy         0.0
dtype: float64
(31307, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['positivity_rate'] = df['positivity_rate'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['FirstDose'] = df['FirstDose'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DosesReceived'] = df['DosesReceived'].fillna(0)
A value is trying to be set on a copy of a slice from

In [58]:
df = df.drop_duplicates()
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,Year,Month,Week,...,tests_done,population,testing_rate,positivity_rate,DosesReceived,DosesExported,FirstDose,FirstDoseRefused,Hospital_Occupancy,ICU_Occupancy
13,Finland,2020-01-22,0,0,0,60.166667,24.933333,2020,1,4,...,21.0,5525292.0,0.380070,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
42,Sweden,2020-01-22,0,0,0,59.333333,18.050000,2020,1,4,...,11.0,10327589.0,0.106511,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
60,Finland,2020-01-23,0,0,0,60.166667,24.933333,2020,1,4,...,21.0,5525292.0,0.380070,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
89,Sweden,2020-01-23,0,0,0,59.333333,18.050000,2020,1,4,...,11.0,10327589.0,0.106511,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
107,Finland,2020-01-24,0,0,0,60.166667,24.933333,2020,1,4,...,21.0,5525292.0,0.380070,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45969,Romania,2021-12-26,1802745,58551,0,44.433333,26.100000,2021,12,51,...,219409.0,19328838.0,1135.138077,2.016326,0.0,0.0,0.0,0.0,0.0,0.0
45973,Slovakia,2021-12-26,1349806,16398,0,48.150000,17.116667,2021,12,51,...,230975.0,5457873.0,4231.959959,8.625176,0.0,0.0,1308.0,0.0,0.0,0.0
45974,Slovenia,2021-12-26,456057,5543,0,46.050000,14.516667,2021,12,51,...,531191.0,2095861.0,25344.762844,1.429994,0.0,0.0,0.0,0.0,0.0,0.0
45975,Spain,2021-12-26,5718007,89019,0,40.400000,-3.683333,2021,12,51,...,2288693.0,47332614.0,4835.340385,15.356057,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
df['Country'].unique()

array(['Finland', 'Sweden', 'Croatia', 'Denmark', 'Ireland', 'Slovakia',
       'Estonia', 'Malta', 'Latvia', 'Lithuania', 'Belgium', 'France',
       'Greece', 'Iceland', 'Italy', 'Luxembourg', 'Norway', 'Portugal',
       'Slovenia', 'Cyprus', 'Germany', 'Hungary', 'Poland',
       'Netherlands', 'Romania', 'Austria', 'Liechtenstein', 'Bulgaria',
       'Spain'], dtype=object)

In [60]:
print((df.isnull().sum()/len(df))*100)


Country               0.0
Date                  0.0
Confirmed             0.0
Deaths                0.0
Recovered             0.0
Lat                   0.0
Long                  0.0
Year                  0.0
Month                 0.0
Week                  0.0
Day                   0.0
Year-Week             0.0
tests_done            0.0
population            0.0
testing_rate          0.0
positivity_rate       0.0
DosesReceived         0.0
DosesExported         0.0
FirstDose             0.0
FirstDoseRefused      0.0
Hospital_Occupancy    0.0
ICU_Occupancy         0.0
dtype: float64


In [61]:
df = df.sort_values(['Country','Date'], ascending=[True, True])
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,Year,Month,Week,...,tests_done,population,testing_rate,positivity_rate,DosesReceived,DosesExported,FirstDose,FirstDoseRefused,Hospital_Occupancy,ICU_Occupancy
4002,Austria,2020-04-06,12297,220,3463,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,824.0,250.0
4064,Austria,2020-04-07,12639,243,4046,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,857.0,243.0
4126,Austria,2020-04-08,12942,273,4512,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,829.0,267.0
4188,Austria,2020-04-09,13244,295,5240,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,820.0,266.0
4250,Austria,2020-04-10,13555,319,6064,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,771.0,261.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45782,Sweden,2021-12-22,1268254,15259,0,59.333333,18.050000,2021,12,51,...,386528.0,10327589.0,3742.674113,7.258206,0.0,0.0,0.0,0.0,0.0,0.0
45829,Sweden,2021-12-23,1273313,15265,0,59.333333,18.050000,2021,12,51,...,386528.0,10327589.0,3742.674113,7.258206,0.0,0.0,0.0,0.0,0.0,0.0
45876,Sweden,2021-12-24,1273313,15265,0,59.333333,18.050000,2021,12,51,...,386528.0,10327589.0,3742.674113,7.258206,0.0,0.0,0.0,0.0,0.0,0.0
45923,Sweden,2021-12-25,1273313,15265,0,59.333333,18.050000,2021,12,51,...,386528.0,10327589.0,3742.674113,7.258206,0.0,0.0,0.0,0.0,0.0,0.0


## 8. Exportacion `df` a `.csv`

In [62]:
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,Year,Month,Week,...,tests_done,population,testing_rate,positivity_rate,DosesReceived,DosesExported,FirstDose,FirstDoseRefused,Hospital_Occupancy,ICU_Occupancy
4002,Austria,2020-04-06,12297,220,3463,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,824.0,250.0
4064,Austria,2020-04-07,12639,243,4046,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,857.0,243.0
4126,Austria,2020-04-08,12942,273,4512,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,829.0,267.0
4188,Austria,2020-04-09,13244,295,5240,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,820.0,266.0
4250,Austria,2020-04-10,13555,319,6064,48.200000,16.366667,2020,4,15,...,12339.0,8901064.0,138.623877,14.895859,0.0,0.0,0.0,0.0,771.0,261.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45782,Sweden,2021-12-22,1268254,15259,0,59.333333,18.050000,2021,12,51,...,386528.0,10327589.0,3742.674113,7.258206,0.0,0.0,0.0,0.0,0.0,0.0
45829,Sweden,2021-12-23,1273313,15265,0,59.333333,18.050000,2021,12,51,...,386528.0,10327589.0,3742.674113,7.258206,0.0,0.0,0.0,0.0,0.0,0.0
45876,Sweden,2021-12-24,1273313,15265,0,59.333333,18.050000,2021,12,51,...,386528.0,10327589.0,3742.674113,7.258206,0.0,0.0,0.0,0.0,0.0,0.0
45923,Sweden,2021-12-25,1273313,15265,0,59.333333,18.050000,2021,12,51,...,386528.0,10327589.0,3742.674113,7.258206,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
df.columns

Index(['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long',
       'Year', 'Month', 'Week', 'Day', 'Year-Week', 'tests_done', 'population',
       'testing_rate', 'positivity_rate', 'DosesReceived', 'DosesExported',
       'FirstDose', 'FirstDoseRefused', 'Hospital_Occupancy', 'ICU_Occupancy'],
      dtype='object')

In [64]:
# Reordeno las Columnas
#    
# 'testing_rate', 'positivity_rate', 'DosesReceived', 'DosesExported','FirstDose', 'FirstDoseRefused', 'Hospital_Occupancy', 'ICU_Occupancy'

df = df[['Country','Lat', 'Long','Date', 'Year', 'Month', 'Week', 'Day', 'Year-Week','population','Confirmed', 'Deaths', 'Recovered',
         'tests_done', 'testing_rate', 'positivity_rate', 'DosesReceived', 'DosesExported','FirstDose', 'FirstDoseRefused', 'Hospital_Occupancy', 'ICU_Occupancy'   ]]

In [65]:
#Cambio nombre a las columnas en un mismo formato camelCase
df = df.rename(columns={'Country':'country',
                        'Lat':'latitude',
                        'Long':'longitude',
                        'Date':'date',
                        'Year':'year',
                        'Month':'month',
                        'Week':'week',
                        'Day':'day',
                        'Year-Week':'yearWeek',
                        'Confirmed':'confirmed',
                        'Recovered':'recovered',
                        'tests_done':'testDone',
                        'testing_rate':'testingRate',
                        'positivity_rate':'positivityRate',
                        'DosesReceived':'dosesReceived',
                        'DosesExported':'dosesExported',
                        'FirstDose':'firstDose',
                        'Hospital_Occupancy':'hospitalOccupancy',
                        'ICU_Occupancy':'IcuOccupancy',
                        })

In [73]:
df = df.reset_index(drop=True)
df.head(5)

Unnamed: 0,country,latitude,longitude,date,year,month,week,day,yearWeek,population,...,recovered,testDone,testingRate,positivityRate,dosesReceived,dosesExported,firstDose,FirstDoseRefused,hospitalOccupancy,IcuOccupancy
0,Austria,48.2,16.366667,2020-04-06,2020,4,15,6,2020-15,8901064.0,...,3463,12339.0,138.623877,14.895859,0.0,0.0,0.0,0.0,824.0,250.0
1,Austria,48.2,16.366667,2020-04-07,2020,4,15,7,2020-15,8901064.0,...,4046,12339.0,138.623877,14.895859,0.0,0.0,0.0,0.0,857.0,243.0
2,Austria,48.2,16.366667,2020-04-08,2020,4,15,8,2020-15,8901064.0,...,4512,12339.0,138.623877,14.895859,0.0,0.0,0.0,0.0,829.0,267.0
3,Austria,48.2,16.366667,2020-04-09,2020,4,15,9,2020-15,8901064.0,...,5240,12339.0,138.623877,14.895859,0.0,0.0,0.0,0.0,820.0,266.0
4,Austria,48.2,16.366667,2020-04-10,2020,4,15,10,2020-15,8901064.0,...,6064,12339.0,138.623877,14.895859,0.0,0.0,0.0,0.0,771.0,261.0


In [67]:
df.to_csv('df.csv')