# Data Cleaning

## 1. Introduccion

## 2. Impotando Librerias

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

## 3. Creacion de `df` utilizando datos de `CoreCode` en `data_core/`

### 3.1. Preparacion dataset `confirmed_global.csv`

In [2]:
#url_confirmed_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv"
#df1 = pd.read_csv(url_confirmed_global)
df1 = pd.read_csv('data_core/confirmed_global.csv')

In [3]:
# El analisis se va a hacer por pais, no por provincia de modo que elimino la columna 'Province/State'. Las columnas de 'Lat' y 'Long' 
# se van a eliminar ahora para luego mergearlas con el dataframe final, ya que las coordenadas se cerian alteradas en el 'groupby'.

df1 = df1.drop(['Province/State'], axis=1)
df1 = df1.drop(['Lat'], axis=1)
df1 = df1.drop(['Long'], axis=1)

In [4]:
# Una vez eliminada dichas columnas agrupamos los datos a nivel de fila por pais Sumando asi todos 
# los casos por pais que anteriormente estaban subdivididos por 'Province/State'.

# Comprobamos que efectivamente, hay nombres de paises que aparecen varias veces
print(df1["Country/Region"].value_counts().to_string())

China                               34
Canada                              16
United Kingdom                      12
France                              12
Australia                            8
Netherlands                          5
Denmark                              3
New Zealand                          2
Panama                               1
Niger                                1
Nigeria                              1
North Macedonia                      1
Norway                               1
Oman                                 1
Pakistan                             1
Palau                                1
Peru                                 1
Papua New Guinea                     1
Paraguay                             1
Philippines                          1
Poland                               1
Portugal                             1
Qatar                                1
Romania                              1
Russia                               1
Rwanda                   

In [5]:
df1.loc[df1["Country/Region"] == "Austria"]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,12/19/21,12/20/21,12/21/21,12/22/21,12/23/21,12/24/21,12/25/21,12/26/21,12/27/21,12/28/21
16,Austria,0,0,0,0,0,0,0,0,0,...,1249641,1251433,1253961,1256230,1258377,1260751,1262836,1264553,1266103,1268519


In [6]:
# Vemos que tras el groupby los casos de agrupado correctamente, ya que la suma de la columna de casos de un dia especifico
# es igual a la fila de ese mismo dia para df1 tras esta operacion
df1 = df1.groupby(['Country/Region']).sum().reset_index()
print(df1.loc[df1["Country/Region"] == "Austria"].sum())

Country/Region    Austria
1/22/20                 0
1/23/20                 0
1/24/20                 0
1/25/20                 0
                   ...   
12/24/21          1260751
12/25/21          1262836
12/26/21          1264553
12/27/21          1266103
12/28/21          1268519
Length: 708, dtype: object


In [7]:
# Vemos que solo existe un valor por pais. 
print(df1["Country/Region"].value_counts().to_string())

Afghanistan                         1
Namibia                             1
Netherlands                         1
New Zealand                         1
Nicaragua                           1
Niger                               1
Nigeria                             1
North Macedonia                     1
Norway                              1
Oman                                1
Pakistan                            1
Palau                               1
Panama                              1
Papua New Guinea                    1
Paraguay                            1
Peru                                1
Philippines                         1
Poland                              1
Portugal                            1
Qatar                               1
Romania                             1
Russia                              1
Rwanda                              1
Nepal                               1
Mozambique                          1
Albania                             1
Morocco     

<div align="center">
Confirmamos que el groupby se ha completado con exito
<div>

In [8]:
# Mergeamos las columnas de 'Date-Countrty' por cada pais y anadimos una columna con su valor correspondiente

# Agrupo las columnas de fecha en filas utilizando la funcion `melt` y hago un idetificador unico para mergear con el resto
# de tablas, que sera el (dia)+(el nombre del pais) para poder mergear correctamente con el resto de tablas por dia y pais
df1 = df1.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Confirmed")

# Creo la columna con el identificador para usarla como indentificador unico para el mergeo
df1['Date-Country'] = df1['Date'] + df1['Country/Region']

# Hago esta misma columna indice del dataframe
df1.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/28/21Vietnam,Vietnam,12/28/21,1680985
12/28/21West Bank and Gaza,West Bank and Gaza,12/28/21,469452
12/28/21Yemen,Yemen,12/28/21,10123
12/28/21Zambia,Zambia,12/28/21,238383


### 3.2. Preparacion dataset `deaths_global.csv`

Repetimos el mismo proceso anterior para el dataset `deaths_global.csv`

In [9]:
#url_deaths_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv"
#df2 = pd.read_csv(url_deaths_global)
df2 = pd.read_csv('data_core/deaths_global.csv')

df2 = df2.drop(['Province/State'], axis=1)
df2 = df2.drop(['Lat'], axis=1)
df2 = df2.drop(['Long'], axis=1)
df2 = df2.groupby(['Country/Region']).sum().reset_index()
df2 = df2.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Deaths")
df2['Date-Country'] = df2['Date'] + df2['Country/Region']

df2.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/28/21Vietnam,Vietnam,12/28/21,31632
12/28/21West Bank and Gaza,West Bank and Gaza,12/28/21,4912
12/28/21Yemen,Yemen,12/28/21,1984
12/28/21Zambia,Zambia,12/28/21,3716


### 3.3. Juntamos todos los dataframe `df1`, `df2` y `df3` en uno solo `df`

In [10]:
# Creo un primer dataframe final (df_f1), mergeando df1 y df2 por 'Date-Country'
df = pd.merge(df1, df2 , how='left', on='Date-Country')

In [11]:
#Elimino las columnas duplicadas
df = df.drop(['Date-Country','Country/Region_y','Date_y'], axis=1)

# Reordeno las Columnas
df = df.rename(columns={'Country/Region_x':'Country', 'Date_x':'Date'})
df = df[['Country','Date','Confirmed','Deaths']]

In [12]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 138572
Filas df sin duplicados: 138572
Hay 0 filas duplicadas


## 4. Anado datos geograficos y poblacion a `df`

### 4.1. Anado las columnas de `'Lat'` y `'Long'` al dataframe `df`

In [13]:
df4 = pd.read_csv("data_extra/concap.csv")
df4 = df4.drop(['CapitalName'], axis=1)
df4 = df4.drop_duplicates()

filter_continent = df4['ContinentName'] == 'Europe'
df4 = df4[filter_continent]

df4 = df4.rename(columns={'CountryName':'Country',
                          'CapitalLatitude':'Lat', 
                          'CapitalLongitude':'Long', 
                          'CountryCode':'geoId',
                          'ContinentName':'continentExp'})
df4.head(3)


Unnamed: 0,Country,Lat,Long,geoId,continentExp
4,Aland Islands,60.116667,19.9,AX,Europe
10,Albania,41.316667,19.816667,AL,Europe
13,Andorra,42.5,1.516667,AD,Europe


In [14]:
df = pd.merge(df, df4 , how='left', on='Country')

### 4.2. Filtrado de `df` por `'continentExP'`: `'Europe'`

Para poder aprovechar los dataset de data_extra, que estan centrados unicamente en Europa, y ademas poder centrar mejor el analisis, voy a filtrar el dataframe eliminando todos los paises que no son europeos.

In [15]:
# Filtro el dataframe (df) para paises europeos 
filter_europe = df['continentExp'] == 'Europe'
df = df[filter_europe]
df = df.sort_values(['Country','Date'], ascending=[True, True])
df = df.reset_index(drop=True)
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Lat,Long,geoId,continentExp
0,Albania,1/1/21,58316,1181,41.316667,19.816667,AL,Europe
1,Albania,1/10/21,63595,1241,41.316667,19.816667,AL,Europe
2,Albania,1/11/21,63971,1247,41.316667,19.816667,AL,Europe
3,Albania,1/12/21,64627,1252,41.316667,19.816667,AL,Europe
4,Albania,1/13/21,65334,1256,41.316667,19.816667,AL,Europe
...,...,...,...,...,...,...,...,...
33224,United Kingdom,9/7/21,7089051,133808,51.500000,-0.083333,GB,Europe
33225,United Kingdom,9/8/20,354934,41675,51.500000,-0.083333,GB,Europe
33226,United Kingdom,9/8/21,7127630,133999,51.500000,-0.083333,GB,Europe
33227,United Kingdom,9/9/20,357615,41683,51.500000,-0.083333,GB,Europe


In [16]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 33229
Filas df sin duplicados: 33229
Hay 0 filas duplicadas


## 5. Modificacion del indice de `df` y creacion de columnas `'Year'`, `'Week'` y `'Day'`

### 5.1. Cambio de tipo de datos

In [17]:
df.dtypes

Country          object
Date             object
Confirmed         int64
Deaths            int64
Lat             float64
Long            float64
geoId            object
continentExp     object
dtype: object

In [18]:
# Cabia Date a tipo datetime
df['Date'] = pd.to_datetime(df.Date)

### 5.2. Extraccion de nuevas columnas a traves de `'Date'`

In [19]:
# Declaro variables
y = df['Date'].dt
x = df['Date'].dt.isocalendar().week.apply(np.int64)

# Creo nuevas columnas con tipo int64
df['Year'] = y.year
df['Month'] = y.month
df['Week'] = x
df['Week-Copy'] = x
df['Day'] = y.day

# comprobamos que efectivamente las columnas se han creado como int64
df.dtypes

Country                 object
Date            datetime64[ns]
Confirmed                int64
Deaths                   int64
Lat                    float64
Long                   float64
geoId                   object
continentExp            object
Year                     int64
Month                    int64
Week                     int64
Week-Copy                int64
Day                      int64
dtype: object

In [20]:
# Para poder mergear con los data set que tienen informacion a nivel 'year'-'week'
# Necesito poner un '0' delante de las semanas que sean menores de 10
# Esta columna al tener el '-', obligatoriamente sera de tipo 'object.

def str_fixer(value):
    if int(value) < 10:
        return f'0{value}'
    else:
        return str(value)

df["Week-Copy"] = df["Week-Copy"].apply(str_fixer)


In [21]:
# Comprobamos que funciona correctamente
df["Week-Copy"].unique()

array(['53', '01', '02', '03', '04', '05', '40', '39', '41', '42', '43',
       '44', '46', '45', '47', '48', '49', '50', '51', '52', '07', '06',
       '08', '09', '11', '10', '12', '13', '14', '15', '16', '17', '18',
       '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29',
       '30', '31', '33', '32', '34', '35', '36', '37', '38'], dtype=object)

In [22]:
# Creo los id que me serviran para mergear el resto de datasets
df["Year-Week"] = df["Year"].apply(str) + "-" + df["Week-Copy"]
df["Year-Week-Copy"] = df["Year"].apply(str) + "-W" + df["Week-Copy"]

In [23]:
df.head(3)

Unnamed: 0,Country,Date,Confirmed,Deaths,Lat,Long,geoId,continentExp,Year,Month,Week,Week-Copy,Day,Year-Week,Year-Week-Copy
0,Albania,2021-01-01,58316,1181,41.316667,19.816667,AL,Europe,2021,1,53,53,1,2021-53,2021-W53
1,Albania,2021-01-10,63595,1241,41.316667,19.816667,AL,Europe,2021,1,1,1,10,2021-01,2021-W01
2,Albania,2021-01-11,63971,1247,41.316667,19.816667,AL,Europe,2021,1,2,2,11,2021-02,2021-W02


In [24]:
df.dtypes

Country                   object
Date              datetime64[ns]
Confirmed                  int64
Deaths                     int64
Lat                      float64
Long                     float64
geoId                     object
continentExp              object
Year                       int64
Month                      int64
Week                       int64
Week-Copy                 object
Day                        int64
Year-Week                 object
Year-Week-Copy            object
dtype: object

In [25]:
df = df.sort_values(['Country','Date'], ascending=[True, True])
df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,Country,Date,Confirmed,Deaths,Lat,Long,geoId,continentExp,Year,Month,Week,Week-Copy,Day,Year-Week,Year-Week-Copy
0,Albania,2020-01-22,0,0,41.316667,19.816667,AL,Europe,2020,1,4,4,22,2020-04,2020-W04
1,Albania,2020-01-23,0,0,41.316667,19.816667,AL,Europe,2020,1,4,4,23,2020-04,2020-W04
2,Albania,2020-01-24,0,0,41.316667,19.816667,AL,Europe,2020,1,4,4,24,2020-04,2020-W04


## 6. Anado columnas de `'ConfirmedDay'` y  `'deathsDay'`

In [26]:
df['confirmedDay'] = df['Confirmed'].diff().fillna(0).astype(int)
df['deathsDay'] = df['Deaths'].diff().fillna(0).astype(int)
df


Unnamed: 0,Country,Date,Confirmed,Deaths,Lat,Long,geoId,continentExp,Year,Month,Week,Week-Copy,Day,Year-Week,Year-Week-Copy,confirmedDay,deathsDay
0,Albania,2020-01-22,0,0,41.316667,19.816667,AL,Europe,2020,1,4,04,22,2020-04,2020-W04,0,0
1,Albania,2020-01-23,0,0,41.316667,19.816667,AL,Europe,2020,1,4,04,23,2020-04,2020-W04,0,0
2,Albania,2020-01-24,0,0,41.316667,19.816667,AL,Europe,2020,1,4,04,24,2020-04,2020-W04,0,0
3,Albania,2020-01-25,0,0,41.316667,19.816667,AL,Europe,2020,1,4,04,25,2020-04,2020-W04,0,0
4,Albania,2020-01-26,0,0,41.316667,19.816667,AL,Europe,2020,1,4,04,26,2020-04,2020-W04,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33224,United Kingdom,2021-12-24,11958841,148324,51.500000,-0.083333,GB,Europe,2021,12,51,51,24,2021-51,2021-W51,121932,137
33225,United Kingdom,2021-12-25,11958928,148324,51.500000,-0.083333,GB,Europe,2021,12,51,51,25,2021-51,2021-W51,87,0
33226,United Kingdom,2021-12-26,11958928,148324,51.500000,-0.083333,GB,Europe,2021,12,51,51,26,2021-51,2021-W51,0,0
33227,United Kingdom,2021-12-27,12277814,148470,51.500000,-0.083333,GB,Europe,2021,12,52,52,27,2021-52,2021-W52,318886,146


In [27]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 33229
Filas df sin duplicados: 33229
Hay 0 filas duplicadas


## 7. Union de los dataset: `ICU_hospital.csv`, `test_rate.csv` y `vaccine_tracker.csv` con `df`

### 7.1. Creacion de `id` en `df` para el mergeo con los dataset de `data_extra`

In [28]:
#Borro Columnas sobrantes
df = df.drop(['continentExp'], axis=1)

# Defino la columna que me serviran para mergear con otros dataset
df['id-merge'] = df['Country'] + df['Date'].apply(str)
df['id-merge'] = df['id-merge'].apply(lambda x: x.split(' ')[0])
df['id-merge']



0        Albania2020-01-22
1        Albania2020-01-23
2        Albania2020-01-24
3        Albania2020-01-25
4        Albania2020-01-26
               ...        
33224               United
33225               United
33226               United
33227               United
33228               United
Name: id-merge, Length: 33229, dtype: object

In [29]:
# Defino la columna que me serviran para mergear con otros dataset
df['id-merge-co'] = df['geoId'] + df['Date'].apply(str)
df['id-merge-co'] = df['id-merge-co'].apply(lambda x: x.split(' ')[0])
df['id-merge-co']

0        AL2020-01-22
1        AL2020-01-23
2        AL2020-01-24
3        AL2020-01-25
4        AL2020-01-26
             ...     
33224    GB2021-12-24
33225    GB2021-12-25
33226    GB2021-12-26
33227    GB2021-12-27
33228    GB2021-12-28
Name: id-merge-co, Length: 33229, dtype: object

In [30]:
# Creo df ara mergear los dataset que estan a nivel de linea year-week
df_merge = df.drop(df.columns.difference(['Date', 'Year-Week-Copy']), axis=1)
df_merge = df_merge.rename(columns={'Year-Week-Copy':'yearWeek'})
df_merge = df_merge.drop_duplicates(subset=['yearWeek'], keep='last')
df_merge['Date'] = df_merge['Date'].astype(str)
df_merge.head(3)

Unnamed: 0,Date,yearWeek
32526,2020-01-26,2020-W04
32533,2020-02-02,2020-W05
32540,2020-02-09,2020-W06


### 7.3. Preapracion dataset `test_rate.csv`

In [31]:
#url_test_rate = "https://opendata.ecdc.europa.eu/covid19/testing/csv/data.csv"
#df_ex1 = pd.read_csv(url_test_rate)
df_ex1 = pd.read_csv('data_extra/test_rate.csv')

In [32]:
df_ex1

Unnamed: 0,country,country_code,year_week,level,region,region_name,new_cases,tests_done,population,testing_rate,positivity_rate,testing_data_source
0,Austria,AT,2020-W15,national,AT,Austria,1838,12339,8901064.0,138.623877,14.895859,Manual webscraping
1,Austria,AT,2020-W16,national,AT,Austria,684,58488,8901064.0,657.089984,1.169471,Manual webscraping
2,Austria,AT,2020-W17,national,AT,Austria,448,33443,8901064.0,375.719128,1.339593,Manual webscraping
3,Austria,AT,2020-W18,national,AT,Austria,312,26598,8901064.0,298.818209,1.173021,Country website
4,Austria,AT,2020-W19,national,AT,Austria,264,42153,8901064.0,473.572598,0.626290,Country website
...,...,...,...,...,...,...,...,...,...,...,...,...
11750,Sweden,SE,2021-W46,national,SE,Sweden,7095,123920,10327589.0,1199.892831,5.725468,TESSy
11751,Sweden,SE,2021-W47,national,SE,Sweden,11916,226289,10327589.0,2191.111594,5.265833,TESSy
11752,Sweden,SE,2021-W48,national,SE,Sweden,13802,273987,10327589.0,2652.961887,5.037465,TESSy
11753,Sweden,SE,2021-W49,national,SE,Sweden,18659,335956,10327589.0,3252.995447,5.554001,TESSy


In [33]:
df_merge = df_merge.rename(columns={'yearWeek':'year_week'})
df_ex1 = pd.merge(df_ex1, df_merge, how='left', on='year_week')
df_ex1


Unnamed: 0,country,country_code,year_week,level,region,region_name,new_cases,tests_done,population,testing_rate,positivity_rate,testing_data_source,Date
0,Austria,AT,2020-W15,national,AT,Austria,1838,12339,8901064.0,138.623877,14.895859,Manual webscraping,2020-04-12
1,Austria,AT,2020-W16,national,AT,Austria,684,58488,8901064.0,657.089984,1.169471,Manual webscraping,2020-04-19
2,Austria,AT,2020-W17,national,AT,Austria,448,33443,8901064.0,375.719128,1.339593,Manual webscraping,2020-04-26
3,Austria,AT,2020-W18,national,AT,Austria,312,26598,8901064.0,298.818209,1.173021,Country website,2020-05-03
4,Austria,AT,2020-W19,national,AT,Austria,264,42153,8901064.0,473.572598,0.626290,Country website,2020-05-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11750,Sweden,SE,2021-W46,national,SE,Sweden,7095,123920,10327589.0,1199.892831,5.725468,TESSy,2021-11-21
11751,Sweden,SE,2021-W47,national,SE,Sweden,11916,226289,10327589.0,2191.111594,5.265833,TESSy,2021-11-28
11752,Sweden,SE,2021-W48,national,SE,Sweden,13802,273987,10327589.0,2652.961887,5.037465,TESSy,2021-12-05
11753,Sweden,SE,2021-W49,national,SE,Sweden,18659,335956,10327589.0,3252.995447,5.554001,TESSy,2021-12-12


In [34]:
df_ex1['level'].unique()

array(['national', 'subnational'], dtype=object)

In [35]:
# Dentro del dataset exiten datos a nivel nacional y subnacional. Vamos a filtrar por nacional, para descartar 
# todas las lineas por provicia, ya que nuestro analisis es a nivel nacional en Europa.

filter_national = df_ex1['level'] == 'national'
df_ex1 = df_ex1[filter_national]

In [36]:
# Eliminamos columnas no necesarias 

df_ex1 = df_ex1.drop(['region_name', 'new_cases', 'testing_data_source','region_name','level','region'], axis=1)

In [37]:
df_ex1['id-merge'] = df_ex1['country'] + df_ex1['Date']

df_ex1 = df_ex1.drop(['year_week', 'country_code','country'], axis=1)

df_ex1['population'] = df_ex1['population'].astype(int)


df_ex1

Unnamed: 0,tests_done,population,testing_rate,positivity_rate,Date,id-merge
0,12339,8901064,138.623877,14.895859,2020-04-12,Austria2020-04-12
1,58488,8901064,657.089984,1.169471,2020-04-19,Austria2020-04-19
2,33443,8901064,375.719128,1.339593,2020-04-26,Austria2020-04-26
3,26598,8901064,298.818209,1.173021,2020-05-03,Austria2020-05-03
4,42153,8901064,473.572598,0.626290,2020-05-10,Austria2020-05-10
...,...,...,...,...,...,...
11750,123920,10327589,1199.892831,5.725468,2021-11-21,Sweden2021-11-21
11751,226289,10327589,2191.111594,5.265833,2021-11-28,Sweden2021-11-28
11752,273987,10327589,2652.961887,5.037465,2021-12-05,Sweden2021-12-05
11753,335956,10327589,3252.995447,5.554001,2021-12-12,Sweden2021-12-12


### 7.4. Preparacion dataset `vaccine_tracker.csv`

In [38]:
#url_vaccine_tracker = "https://opendata.ecdc.europa.eu/covid19/vaccine_tracker/csv/data.csv"
#df_ex2 = pd.read_csv(url_vaccine_tracker)
df_ex2 = pd.read_csv('data_extra/vaccine_tracker.csv')
df_ex2

Unnamed: 0,YearWeekISO,ReportingCountry,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose,DoseAdditional1,UnknownDose,Region,TargetGroup,Vaccine,Population
0,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,MOD,8901064
1,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,JANSS,8901064
2,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,UNK,8901064
3,2020-W53,AT,8901064.0,61425.0,0.0,5243,,0,0,0,AT,ALL,COM,8901064
4,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,AZ,8901064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197511,2021-W51,SK,391090.0,0.0,0.0,23,,8,292,0,SK,Age70_79,MOD,5457873
197512,2021-W51,SK,391090.0,0.0,0.0,163,,201,1457,0,SK,Age70_79,COM,5457873
197513,2021-W51,SK,184680.0,0.0,0.0,1,,0,0,0,SK,Age80+,JANSS,5457873
197514,2021-W51,SK,184680.0,0.0,0.0,70,,76,451,0,SK,Age80+,COM,5457873


In [39]:
df_merge = df_merge.rename(columns={'year_week':'YearWeekISO'})
df_ex2 = pd.merge(df_ex2, df_merge, how='left', on='YearWeekISO')
df_ex2

Unnamed: 0,YearWeekISO,ReportingCountry,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose,DoseAdditional1,UnknownDose,Region,TargetGroup,Vaccine,Population,Date
0,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,MOD,8901064,2020-12-31
1,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,JANSS,8901064,2020-12-31
2,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,UNK,8901064,2020-12-31
3,2020-W53,AT,8901064.0,61425.0,0.0,5243,,0,0,0,AT,ALL,COM,8901064,2020-12-31
4,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,AZ,8901064,2020-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197511,2021-W51,SK,391090.0,0.0,0.0,23,,8,292,0,SK,Age70_79,MOD,5457873,2021-12-26
197512,2021-W51,SK,391090.0,0.0,0.0,163,,201,1457,0,SK,Age70_79,COM,5457873,2021-12-26
197513,2021-W51,SK,184680.0,0.0,0.0,1,,0,0,0,SK,Age80+,JANSS,5457873,2021-12-26
197514,2021-W51,SK,184680.0,0.0,0.0,70,,76,451,0,SK,Age80+,COM,5457873,2021-12-26


In [40]:
df_ex2['TargetGroup'].unique()

array(['ALL', 'Age0_4', 'Age10_14', 'Age15_17', 'Age18_24', 'Age25_49',
       'Age50_59', 'Age5_9', 'Age60_69', 'Age70_79', 'Age80+', 'Age<18',
       'AgeUNK', 'HCW', 'LTCF', '1_Age60+', '1_Age<60'], dtype=object)

In [41]:
filter_target = df_ex2['TargetGroup'] == 'ALL'
df_ex2 = df_ex2[filter_target]

df_ex2

Unnamed: 0,YearWeekISO,ReportingCountry,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose,DoseAdditional1,UnknownDose,Region,TargetGroup,Vaccine,Population,Date
0,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,MOD,8901064,2020-12-31
1,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,JANSS,8901064,2020-12-31
2,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,UNK,8901064,2020-12-31
3,2020-W53,AT,8901064.0,61425.0,0.0,5243,,0,0,0,AT,ALL,COM,8901064,2020-12-31
4,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,AZ,8901064,2020-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197466,2021-W50,SK,5457873.0,0.0,0.0,13402,,19690,141121,0,SK,ALL,COM,5457873,2021-12-19
197467,2021-W50,SK,5457873.0,0.0,0.0,1029,,1533,22251,0,SK,ALL,MOD,5457873,2021-12-19
197492,2021-W51,SK,5457873.0,0.0,0.0,1082,,3205,13691,0,SK,ALL,COM,5457873,2021-12-26
197493,2021-W51,SK,5457873.0,0.0,0.0,128,,129,2736,0,SK,ALL,MOD,5457873,2021-12-26


In [42]:
# Borro columnas sobrantes para el merge
df_ex2 = df_ex2.drop(['Denominator', 'DoseAdditional1', 'UnknownDose','Population','Vaccine','TargetGroup', 'YearWeekISO'], axis=1)

# Defino el id del merge
df_ex2['id-merge-co'] = df_ex2['ReportingCountry'] + df_ex2['Date']

# Borro columnas sobrantes usadas para crear el id
df_ex2 = df_ex2.drop(['ReportingCountry', 'Region','FirstDoseRefused'], axis=1)


df_ex2 = df_ex2.groupby(['id-merge-co']).sum().reset_index()

df_ex2

Unnamed: 0,id-merge-co,NumberDosesReceived,NumberDosesExported,FirstDose,SecondDose
0,AT2020-12-31,61425.0,0.0,5243,0
1,AT2021-01-10,61425.0,0.0,26181,0
2,AT2021-01-17,68625.0,0.0,84934,398
3,AT2021-01-24,58500.0,0.0,93267,4568
4,AT2021-01-31,54990.0,0.0,31517,17516
...,...,...,...,...,...
1548,SK2021-11-28,0.0,0.0,49023,6738
1549,SK2021-12-05,0.0,0.0,26133,7134
1550,SK2021-12-12,0.0,0.0,20317,14250
1551,SK2021-12-19,0.0,0.0,16742,21223


### 7.2. Preparacion dataset `ICU_hospital.csv`

In [43]:
#url_UCI = "https://opendata.ecdc.europa.eu/covid19/hospitalicuadmissionrates/csv/data.csv"
#df_ex3 = pd.read_csv(url_UCI)
df_ex3 = pd.read_csv('data_extra/ICU_hospital.csv')

In [44]:
df_ex3

Unnamed: 0,country,indicator,date,year_week,value,source,url
0,Austria,Daily hospital occupancy,2020-04-01,2020-W14,856.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
1,Austria,Daily hospital occupancy,2020-04-02,2020-W14,823.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
2,Austria,Daily hospital occupancy,2020-04-03,2020-W14,829.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
3,Austria,Daily hospital occupancy,2020-04-04,2020-W14,826.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
4,Austria,Daily hospital occupancy,2020-04-05,2020-W14,712.000000,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
...,...,...,...,...,...,...,...
31489,Sweden,Weekly new ICU admissions per 100k,2021-11-21,2021-W46,0.125876,"TESSy COVID-19, national daily data",
31490,Sweden,Weekly new ICU admissions per 100k,2021-11-28,2021-W47,0.329215,"TESSy COVID-19, national daily data",
31491,Sweden,Weekly new ICU admissions per 100k,2021-12-05,2021-W48,0.309850,"TESSy COVID-19, national daily data",
31492,Sweden,Weekly new ICU admissions per 100k,2021-12-12,2021-W49,0.445409,"TESSy COVID-19, national daily data",


In [45]:
print((df_ex3.isnull().sum()/len(df_ex3))*100)

country      0.000000
indicator    0.000000
date         0.000000
year_week    0.000000
value        0.000000
source       0.000000
url          9.862196
dtype: float64


In [46]:
df_ex3['indicator'].unique()

array(['Daily hospital occupancy', 'Daily ICU occupancy',
       'Weekly new hospital admissions per 100k',
       'Weekly new ICU admissions per 100k'], dtype=object)

In [47]:
# Divido el dataset en 4 dataset por indicator y luego los uno en uno solo por columnas
df_ex3['id-merge'] = df_ex3['country']+df_ex3['date']
df_ex3['id-merge']

0        Austria2020-04-01
1        Austria2020-04-02
2        Austria2020-04-03
3        Austria2020-04-04
4        Austria2020-04-05
               ...        
31489     Sweden2021-11-21
31490     Sweden2021-11-28
31491     Sweden2021-12-05
31492     Sweden2021-12-12
31493     Sweden2021-12-19
Name: id-merge, Length: 31494, dtype: object

In [48]:
# Borro columnas sobrantes
df_ex3 = df_ex3.drop(['year_week', 'source', 'url'], axis=1)

In [49]:
# Hospital_Occupancy

filter_uci = df_ex3['indicator'] == 'Daily hospital occupancy'
df_uci_1 = df_ex3[filter_uci]

# DEFINO EL DATAFRAME
df_uci_1 = df_uci_1.rename(columns={'value':'Hospital_Occupancy'})
df_uci_1 = df_uci_1.drop(['indicator','date','country'], axis=1)

df_uci_1

Unnamed: 0,Hospital_Occupancy,id-merge
0,856.0,Austria2020-04-01
1,823.0,Austria2020-04-02
2,829.0,Austria2020-04-03
3,826.0,Austria2020-04-04
4,712.0,Austria2020-04-05
...,...,...
30733,496.0,Sweden2021-12-15
30734,526.0,Sweden2021-12-16
30735,524.0,Sweden2021-12-17
30736,513.0,Sweden2021-12-18


In [50]:
# ICU occupancy

filter_uci = df_ex3['indicator'] == 'Daily ICU occupancy'
df_uci_2 = df_ex3[filter_uci]

#DEFINO EL DATAFRAME
df_uci_2 = df_uci_2.rename(columns={'value':'ICU_Occupancy'})
df_uci_2 = df_uci_2.drop(['indicator','date','country'], axis=1)

df_uci_2

Unnamed: 0,ICU_Occupancy,id-merge
628,215.0,Austria2020-04-01
629,219.0,Austria2020-04-02
630,245.0,Austria2020-04-03
631,245.0,Austria2020-04-04
632,244.0,Austria2020-04-05
...,...,...
31395,75.0,Sweden2021-12-15
31396,70.0,Sweden2021-12-16
31397,68.0,Sweden2021-12-17
31398,72.0,Sweden2021-12-18


In [51]:
df_ex3 = df_ex3.drop(['indicator','value','country'], axis=1)
df_ex3

Unnamed: 0,date,id-merge
0,2020-04-01,Austria2020-04-01
1,2020-04-02,Austria2020-04-02
2,2020-04-03,Austria2020-04-03
3,2020-04-04,Austria2020-04-04
4,2020-04-05,Austria2020-04-05
...,...,...
31489,2021-11-21,Sweden2021-11-21
31490,2021-11-28,Sweden2021-11-28
31491,2021-12-05,Sweden2021-12-05
31492,2021-12-12,Sweden2021-12-12


In [52]:
df_ex3 = pd.merge(df_ex3, df_uci_1 , how='left', on='id-merge')
df_ex3

Unnamed: 0,date,id-merge,Hospital_Occupancy
0,2020-04-01,Austria2020-04-01,856.0
1,2020-04-02,Austria2020-04-02,823.0
2,2020-04-03,Austria2020-04-03,829.0
3,2020-04-04,Austria2020-04-04,826.0
4,2020-04-05,Austria2020-04-05,712.0
...,...,...,...
31489,2021-11-21,Sweden2021-11-21,272.0
31490,2021-11-28,Sweden2021-11-28,293.0
31491,2021-12-05,Sweden2021-12-05,344.0
31492,2021-12-12,Sweden2021-12-12,451.0


In [53]:
df_ex3 = pd.merge(df_ex3, df_uci_2 , how='left', on='id-merge')
df_ex3

Unnamed: 0,date,id-merge,Hospital_Occupancy,ICU_Occupancy
0,2020-04-01,Austria2020-04-01,856.0,215.0
1,2020-04-02,Austria2020-04-02,823.0,219.0
2,2020-04-03,Austria2020-04-03,829.0,245.0
3,2020-04-04,Austria2020-04-04,826.0,245.0
4,2020-04-05,Austria2020-04-05,712.0,244.0
...,...,...,...,...
31489,2021-11-21,Sweden2021-11-21,272.0,31.0
31490,2021-11-28,Sweden2021-11-28,293.0,29.0
31491,2021-12-05,Sweden2021-12-05,344.0,46.0
31492,2021-12-12,Sweden2021-12-12,451.0,53.0


In [54]:
df_ex3 = df_ex3.drop(columns=['date'], axis=1)
df_ex3

Unnamed: 0,id-merge,Hospital_Occupancy,ICU_Occupancy
0,Austria2020-04-01,856.0,215.0
1,Austria2020-04-02,823.0,219.0
2,Austria2020-04-03,829.0,245.0
3,Austria2020-04-04,826.0,245.0
4,Austria2020-04-05,712.0,244.0
...,...,...,...
31489,Sweden2021-11-21,272.0,31.0
31490,Sweden2021-11-28,293.0,29.0
31491,Sweden2021-12-05,344.0,46.0
31492,Sweden2021-12-12,451.0,53.0


In [55]:
# sobrescribo el data frame con el data frame con los duplicados eliminados
df_DD = df_ex3.drop_duplicates()

print(f"Filas df: {df_ex3.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df_ex3.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 31494
Filas df sin duplicados: 17195
Hay 14299 filas duplicadas


In [56]:
df_ex3 = df_ex3.drop_duplicates()

In [57]:
df_DD = df_ex3.drop_duplicates()

print(f"Filas df: {df_ex3.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df_ex3.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 17195
Filas df sin duplicados: 17195
Hay 0 filas duplicadas


### 7.5. Mergeo: Enriquecimiento de `df` con datadet: `test_rate.csv`, `vaccine_tracker.csv` y `ICU_hospital.csv`

In [58]:
df.columns

Index(['Country', 'Date', 'Confirmed', 'Deaths', 'Lat', 'Long', 'geoId',
       'Year', 'Month', 'Week', 'Week-Copy', 'Day', 'Year-Week',
       'Year-Week-Copy', 'confirmedDay', 'deathsDay', 'id-merge',
       'id-merge-co'],
      dtype='object')

In [59]:
# Mergeo con el dataset de test
df = pd.merge(df, df_ex1 , how='left', on='id-merge')
print((df.isnull().sum()/len(df))*100)
df

Country             0.000000
Date_x              0.000000
Confirmed           0.000000
Deaths              0.000000
Lat                 0.000000
Long                0.000000
geoId               0.000000
Year                0.000000
Month               0.000000
Week                0.000000
Week-Copy           0.000000
Day                 0.000000
Year-Week           0.000000
Year-Week-Copy      0.000000
confirmedDay        0.000000
deathsDay           0.000000
id-merge            0.000000
id-merge-co         0.000000
tests_done         91.916699
population         91.916699
testing_rate       91.916699
positivity_rate    91.940775
Date_y             91.916699
dtype: float64


Unnamed: 0,Country,Date_x,Confirmed,Deaths,Lat,Long,geoId,Year,Month,Week,...,Year-Week-Copy,confirmedDay,deathsDay,id-merge,id-merge-co,tests_done,population,testing_rate,positivity_rate,Date_y
0,Albania,2020-01-22,0,0,41.316667,19.816667,AL,2020,1,4,...,2020-W04,0,0,Albania2020-01-22,AL2020-01-22,,,,,
1,Albania,2020-01-23,0,0,41.316667,19.816667,AL,2020,1,4,...,2020-W04,0,0,Albania2020-01-23,AL2020-01-23,,,,,
2,Albania,2020-01-24,0,0,41.316667,19.816667,AL,2020,1,4,...,2020-W04,0,0,Albania2020-01-24,AL2020-01-24,,,,,
3,Albania,2020-01-25,0,0,41.316667,19.816667,AL,2020,1,4,...,2020-W04,0,0,Albania2020-01-25,AL2020-01-25,,,,,
4,Albania,2020-01-26,0,0,41.316667,19.816667,AL,2020,1,4,...,2020-W04,0,0,Albania2020-01-26,AL2020-01-26,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33224,United Kingdom,2021-12-24,11958841,148324,51.500000,-0.083333,GB,2021,12,51,...,2021-W51,121932,137,United,GB2021-12-24,,,,,
33225,United Kingdom,2021-12-25,11958928,148324,51.500000,-0.083333,GB,2021,12,51,...,2021-W51,87,0,United,GB2021-12-25,,,,,
33226,United Kingdom,2021-12-26,11958928,148324,51.500000,-0.083333,GB,2021,12,51,...,2021-W51,0,0,United,GB2021-12-26,,,,,
33227,United Kingdom,2021-12-27,12277814,148470,51.500000,-0.083333,GB,2021,12,52,...,2021-W52,318886,146,United,GB2021-12-27,,,,,


In [60]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 33229
Filas df sin duplicados: 33229
Hay 0 filas duplicadas


In [61]:
# merge con dataset vaccine 
df = pd.merge(df, df_ex2 , how='left', on='id-merge-co')
print((df.isnull().sum()/len(df))*100)
df

Country                 0.000000
Date_x                  0.000000
Confirmed               0.000000
Deaths                  0.000000
Lat                     0.000000
Long                    0.000000
geoId                   0.000000
Year                    0.000000
Month                   0.000000
Week                    0.000000
Week-Copy               0.000000
Day                     0.000000
Year-Week               0.000000
Year-Week-Copy          0.000000
confirmedDay            0.000000
deathsDay               0.000000
id-merge                0.000000
id-merge-co             0.000000
tests_done             91.916699
population             91.916699
testing_rate           91.916699
positivity_rate        91.940775
Date_y                 91.916699
NumberDosesReceived    95.639351
NumberDosesExported    95.639351
FirstDose              95.639351
SecondDose             95.639351
dtype: float64


Unnamed: 0,Country,Date_x,Confirmed,Deaths,Lat,Long,geoId,Year,Month,Week,...,id-merge-co,tests_done,population,testing_rate,positivity_rate,Date_y,NumberDosesReceived,NumberDosesExported,FirstDose,SecondDose
0,Albania,2020-01-22,0,0,41.316667,19.816667,AL,2020,1,4,...,AL2020-01-22,,,,,,,,,
1,Albania,2020-01-23,0,0,41.316667,19.816667,AL,2020,1,4,...,AL2020-01-23,,,,,,,,,
2,Albania,2020-01-24,0,0,41.316667,19.816667,AL,2020,1,4,...,AL2020-01-24,,,,,,,,,
3,Albania,2020-01-25,0,0,41.316667,19.816667,AL,2020,1,4,...,AL2020-01-25,,,,,,,,,
4,Albania,2020-01-26,0,0,41.316667,19.816667,AL,2020,1,4,...,AL2020-01-26,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33224,United Kingdom,2021-12-24,11958841,148324,51.500000,-0.083333,GB,2021,12,51,...,GB2021-12-24,,,,,,,,,
33225,United Kingdom,2021-12-25,11958928,148324,51.500000,-0.083333,GB,2021,12,51,...,GB2021-12-25,,,,,,,,,
33226,United Kingdom,2021-12-26,11958928,148324,51.500000,-0.083333,GB,2021,12,51,...,GB2021-12-26,,,,,,,,,
33227,United Kingdom,2021-12-27,12277814,148470,51.500000,-0.083333,GB,2021,12,52,...,GB2021-12-27,,,,,,,,,


In [62]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 33229
Filas df sin duplicados: 33229
Hay 0 filas duplicadas


In [63]:
df = pd.merge(df, df_ex3 , how='left', on='id-merge')
print((df.isnull().sum()/len(df))*100)
df

Country                 0.000000
Date_x                  0.000000
Confirmed               0.000000
Deaths                  0.000000
Lat                     0.000000
Long                    0.000000
geoId                   0.000000
Year                    0.000000
Month                   0.000000
Week                    0.000000
Week-Copy               0.000000
Day                     0.000000
Year-Week               0.000000
Year-Week-Copy          0.000000
confirmedDay            0.000000
deathsDay               0.000000
id-merge                0.000000
id-merge-co             0.000000
tests_done             91.916699
population             91.916699
testing_rate           91.916699
positivity_rate        91.940775
Date_y                 91.916699
NumberDosesReceived    95.639351
NumberDosesExported    95.639351
FirstDose              95.639351
SecondDose             95.639351
Hospital_Occupancy     55.099461
ICU_Occupancy          63.441572
dtype: float64


Unnamed: 0,Country,Date_x,Confirmed,Deaths,Lat,Long,geoId,Year,Month,Week,...,population,testing_rate,positivity_rate,Date_y,NumberDosesReceived,NumberDosesExported,FirstDose,SecondDose,Hospital_Occupancy,ICU_Occupancy
0,Albania,2020-01-22,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
1,Albania,2020-01-23,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
2,Albania,2020-01-24,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
3,Albania,2020-01-25,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
4,Albania,2020-01-26,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33224,United Kingdom,2021-12-24,11958841,148324,51.500000,-0.083333,GB,2021,12,51,...,,,,,,,,,,
33225,United Kingdom,2021-12-25,11958928,148324,51.500000,-0.083333,GB,2021,12,51,...,,,,,,,,,,
33226,United Kingdom,2021-12-26,11958928,148324,51.500000,-0.083333,GB,2021,12,51,...,,,,,,,,,,
33227,United Kingdom,2021-12-27,12277814,148470,51.500000,-0.083333,GB,2021,12,52,...,,,,,,,,,,


In [64]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 33229
Filas df sin duplicados: 33229
Hay 0 filas duplicadas


In [65]:
df.columns

Index(['Country', 'Date_x', 'Confirmed', 'Deaths', 'Lat', 'Long', 'geoId',
       'Year', 'Month', 'Week', 'Week-Copy', 'Day', 'Year-Week',
       'Year-Week-Copy', 'confirmedDay', 'deathsDay', 'id-merge',
       'id-merge-co', 'tests_done', 'population', 'testing_rate',
       'positivity_rate', 'Date_y', 'NumberDosesReceived',
       'NumberDosesExported', 'FirstDose', 'SecondDose', 'Hospital_Occupancy',
       'ICU_Occupancy'],
      dtype='object')

In [66]:
df

Unnamed: 0,Country,Date_x,Confirmed,Deaths,Lat,Long,geoId,Year,Month,Week,...,population,testing_rate,positivity_rate,Date_y,NumberDosesReceived,NumberDosesExported,FirstDose,SecondDose,Hospital_Occupancy,ICU_Occupancy
0,Albania,2020-01-22,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
1,Albania,2020-01-23,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
2,Albania,2020-01-24,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
3,Albania,2020-01-25,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
4,Albania,2020-01-26,0,0,41.316667,19.816667,AL,2020,1,4,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33224,United Kingdom,2021-12-24,11958841,148324,51.500000,-0.083333,GB,2021,12,51,...,,,,,,,,,,
33225,United Kingdom,2021-12-25,11958928,148324,51.500000,-0.083333,GB,2021,12,51,...,,,,,,,,,,
33226,United Kingdom,2021-12-26,11958928,148324,51.500000,-0.083333,GB,2021,12,51,...,,,,,,,,,,
33227,United Kingdom,2021-12-27,12277814,148470,51.500000,-0.083333,GB,2021,12,52,...,,,,,,,,,,


## 8. Dataframe final `df`

### 8.1. Limpieza de columnas `df`

In [67]:
df = df.drop(['id-merge', 'Year-Week-Copy', 'Week-Copy', 'geoId','id-merge-co'], axis=1)
df = df.rename(columns={'NumberDosesReceived':'dosesReceived',
                        'NumberDosesExported':'dosesExported',
                        'Date_x':'Date'})

df.columns

Index(['Country', 'Date', 'Confirmed', 'Deaths', 'Lat', 'Long', 'Year',
       'Month', 'Week', 'Day', 'Year-Week', 'confirmedDay', 'deathsDay',
       'tests_done', 'population', 'testing_rate', 'positivity_rate', 'Date_y',
       'dosesReceived', 'dosesExported', 'FirstDose', 'SecondDose',
       'Hospital_Occupancy', 'ICU_Occupancy'],
      dtype='object')

In [68]:
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Lat,Long,Year,Month,Week,Day,...,population,testing_rate,positivity_rate,Date_y,dosesReceived,dosesExported,FirstDose,SecondDose,Hospital_Occupancy,ICU_Occupancy
0,Albania,2020-01-22,0,0,41.316667,19.816667,2020,1,4,22,...,,,,,,,,,,
1,Albania,2020-01-23,0,0,41.316667,19.816667,2020,1,4,23,...,,,,,,,,,,
2,Albania,2020-01-24,0,0,41.316667,19.816667,2020,1,4,24,...,,,,,,,,,,
3,Albania,2020-01-25,0,0,41.316667,19.816667,2020,1,4,25,...,,,,,,,,,,
4,Albania,2020-01-26,0,0,41.316667,19.816667,2020,1,4,26,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33224,United Kingdom,2021-12-24,11958841,148324,51.500000,-0.083333,2021,12,51,24,...,,,,,,,,,,
33225,United Kingdom,2021-12-25,11958928,148324,51.500000,-0.083333,2021,12,51,25,...,,,,,,,,,,
33226,United Kingdom,2021-12-26,11958928,148324,51.500000,-0.083333,2021,12,51,26,...,,,,,,,,,,
33227,United Kingdom,2021-12-27,12277814,148470,51.500000,-0.083333,2021,12,52,27,...,,,,,,,,,,


### 7.3. Rellenando valores nulos

In [69]:
'''
# con este filtro voy a eliminar varios paises que no aparecen directamente en mis datos enriquecidos 
filter_Confirmed_0 = df['population'] <= 0
df = df[filter_Confirmed_0]
df
'''

"\n# con este filtro voy a eliminar varios paises que no aparecen directamente en mis datos enriquecidos \nfilter_Confirmed_0 = df['population'] <= 0\ndf = df[filter_Confirmed_0]\ndf\n"

In [70]:
'''
df['positivity_rate'] = df['positivity_rate'].fillna(0)

df['FirstDose'] = df['FirstDose'].fillna(0)
df['DosesReceived'] = df['DosesReceived'].fillna(0)
df['DosesExported'] = df['DosesExported'].fillna(0)

df['Hospital_Occupancy'] = df['Hospital_Occupancy'].fillna(0)
df['ICU_Occupancy'] = df['ICU_Occupancy'].fillna(0)

print((df.isnull().sum()/len(df))*100)
print(df.shape)
'''


"\ndf['positivity_rate'] = df['positivity_rate'].fillna(0)\n\ndf['FirstDose'] = df['FirstDose'].fillna(0)\ndf['DosesReceived'] = df['DosesReceived'].fillna(0)\ndf['DosesExported'] = df['DosesExported'].fillna(0)\n\ndf['Hospital_Occupancy'] = df['Hospital_Occupancy'].fillna(0)\ndf['ICU_Occupancy'] = df['ICU_Occupancy'].fillna(0)\n\nprint((df.isnull().sum()/len(df))*100)\nprint(df.shape)\n"

In [71]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 33229
Filas df sin duplicados: 33229
Hay 0 filas duplicadas


In [72]:
print((df.isnull().sum()/len(df))*100)
print(df.shape)

Country                0.000000
Date                   0.000000
Confirmed              0.000000
Deaths                 0.000000
Lat                    0.000000
Long                   0.000000
Year                   0.000000
Month                  0.000000
Week                   0.000000
Day                    0.000000
Year-Week              0.000000
confirmedDay           0.000000
deathsDay              0.000000
tests_done            91.916699
population            91.916699
testing_rate          91.916699
positivity_rate       91.940775
Date_y                91.916699
dosesReceived         95.639351
dosesExported         95.639351
FirstDose             95.639351
SecondDose            95.639351
Hospital_Occupancy    55.099461
ICU_Occupancy         63.441572
dtype: float64
(33229, 24)


In [73]:
n_null = df.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Hay 319233 registros nulos en total


In [74]:
df['Country'].unique()

array(['Albania', 'Andorra', 'Armenia', 'Austria', 'Azerbaijan',
       'Belarus', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria',
       'Croatia', 'Cyprus', 'Denmark', 'Estonia', 'Finland', 'France',
       'Georgia', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland',
       'Italy', 'Kosovo', 'Latvia', 'Liechtenstein', 'Lithuania',
       'Luxembourg', 'Malta', 'Moldova', 'Monaco', 'Montenegro',
       'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia',
       'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Sweden',
       'Switzerland', 'Turkey', 'Ukraine', 'United Kingdom'], dtype=object)

In [75]:
Espana = df.loc[:, 'Country'] == 'Spain'
df = df.loc[Espana]

In [76]:
print((df.isnull().sum()/len(df))*100)
n_null = df.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Country                0.000000
Date                   0.000000
Confirmed              0.000000
Deaths                 0.000000
Lat                    0.000000
Long                   0.000000
Year                   0.000000
Month                  0.000000
Week                   0.000000
Day                    0.000000
Year-Week              0.000000
confirmedDay           0.000000
deathsDay              0.000000
tests_done            91.089109
population            91.089109
testing_rate          91.089109
positivity_rate       91.089109
Date_y                91.089109
dosesReceived         92.786421
dosesExported         92.786421
FirstDose             92.786421
SecondDose            92.786421
Hospital_Occupancy    55.162659
ICU_Occupancy         55.162659
dtype: float64
Hay 6624 registros nulos en total


In [77]:
df = df.sort_values(['Country','Date'], ascending=[True, True])
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Lat,Long,Year,Month,Week,Day,...,population,testing_rate,positivity_rate,Date_y,dosesReceived,dosesExported,FirstDose,SecondDose,Hospital_Occupancy,ICU_Occupancy
28987,Spain,2020-01-22,0,0,40.4,-3.683333,2020,1,4,22,...,,,,,,,,,,
28988,Spain,2020-01-23,0,0,40.4,-3.683333,2020,1,4,23,...,,,,,,,,,,
28989,Spain,2020-01-24,0,0,40.4,-3.683333,2020,1,4,24,...,,,,,,,,,,
28990,Spain,2020-01-25,0,0,40.4,-3.683333,2020,1,4,25,...,,,,,,,,,,
28991,Spain,2020-01-26,0,0,40.4,-3.683333,2020,1,4,26,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29689,Spain,2021-12-24,5718007,89019,40.4,-3.683333,2021,12,51,24,...,,,,,,,,,,
29690,Spain,2021-12-25,5718007,89019,40.4,-3.683333,2021,12,51,25,...,,,,,,,,,,
29691,Spain,2021-12-26,5718007,89019,40.4,-3.683333,2021,12,51,26,...,,,,,,,,,,
29692,Spain,2021-12-27,5932626,89139,40.4,-3.683333,2021,12,52,27,...,,,,,,,,,,


## 8. Exportacion `df` a `.csv`

In [78]:
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Lat,Long,Year,Month,Week,Day,...,population,testing_rate,positivity_rate,Date_y,dosesReceived,dosesExported,FirstDose,SecondDose,Hospital_Occupancy,ICU_Occupancy
28987,Spain,2020-01-22,0,0,40.4,-3.683333,2020,1,4,22,...,,,,,,,,,,
28988,Spain,2020-01-23,0,0,40.4,-3.683333,2020,1,4,23,...,,,,,,,,,,
28989,Spain,2020-01-24,0,0,40.4,-3.683333,2020,1,4,24,...,,,,,,,,,,
28990,Spain,2020-01-25,0,0,40.4,-3.683333,2020,1,4,25,...,,,,,,,,,,
28991,Spain,2020-01-26,0,0,40.4,-3.683333,2020,1,4,26,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29689,Spain,2021-12-24,5718007,89019,40.4,-3.683333,2021,12,51,24,...,,,,,,,,,,
29690,Spain,2021-12-25,5718007,89019,40.4,-3.683333,2021,12,51,25,...,,,,,,,,,,
29691,Spain,2021-12-26,5718007,89019,40.4,-3.683333,2021,12,51,26,...,,,,,,,,,,
29692,Spain,2021-12-27,5932626,89139,40.4,-3.683333,2021,12,52,27,...,,,,,,,,,,


In [79]:
df.columns

Index(['Country', 'Date', 'Confirmed', 'Deaths', 'Lat', 'Long', 'Year',
       'Month', 'Week', 'Day', 'Year-Week', 'confirmedDay', 'deathsDay',
       'tests_done', 'population', 'testing_rate', 'positivity_rate', 'Date_y',
       'dosesReceived', 'dosesExported', 'FirstDose', 'SecondDose',
       'Hospital_Occupancy', 'ICU_Occupancy'],
      dtype='object')

In [80]:
# Reordeno las Columnas
#    
# 'testing_rate', 'positivity_rate', 'DosesReceived', 'DosesExported','FirstDose', 'FirstDoseRefused', 'Hospital_Occupancy', 'ICU_Occupancy'

df = df[['Country','Lat', 'Long','Date', 'Year', 'Month', 'Week', 'Day', 'Year-Week','population','Confirmed', 'Deaths', 'confirmedDay', 'deathsDay',
         'tests_done', 'testing_rate', 'positivity_rate', 'dosesReceived', 'dosesExported','FirstDose', 'Hospital_Occupancy', 'ICU_Occupancy'   ]]

In [81]:
#Cambio nombre a las columnas en un mismo formato camelCase
df = df.rename(columns={'Country':'country',
                        'Lat':'latitude',
                        'Long':'longitude',
                        'Date':'date',
                        'Year':'year',
                        'Month':'month',
                        'Week':'week',
                        'Day':'day',
                        'Year-Week':'yearWeek',
                        'Confirmed':'confirmed',
                        'Deaths':'deaths',
                        'tests_done':'testDone',
                        'testing_rate':'testingRate',
                        'positivity_rate':'positivityRate',
                        'FirstDose':'firstDose',
                        'Hospital_Occupancy':'hospitalOccupancy',
                        'ICU_Occupancy':'IcuOccupancy',
                        })

In [82]:
df = df.reset_index(drop=True)
df.head(5)

Unnamed: 0,country,latitude,longitude,date,year,month,week,day,yearWeek,population,...,confirmedDay,deathsDay,testDone,testingRate,positivityRate,dosesReceived,dosesExported,firstDose,hospitalOccupancy,IcuOccupancy
0,Spain,40.4,-3.683333,2020-01-22,2020,1,4,22,2020-04,,...,-458568,-5566,,,,,,,,
1,Spain,40.4,-3.683333,2020-01-23,2020,1,4,23,2020-04,,...,0,0,,,,,,,,
2,Spain,40.4,-3.683333,2020-01-24,2020,1,4,24,2020-04,,...,0,0,,,,,,,,
3,Spain,40.4,-3.683333,2020-01-25,2020,1,4,25,2020-04,,...,0,0,,,,,,,,
4,Spain,40.4,-3.683333,2020-01-26,2020,1,4,26,2020-04,,...,0,0,,,,,,,,


In [83]:
df['testingRate'] = np.round(df['testingRate'], 
                       decimals = 2) 

df['positivityRate'] = np.round(df['positivityRate'], 
                       decimals = 2) 


In [84]:
df['population'] = df['population'].astype(int)
df['testDone'] = df['testDone'].astype(int)

df['dosesReceived'] = df['dosesReceived'].astype(int)
df['dosesExported'] = df['dosesExported'].astype(int)
df['firstDose'] = df['firstDose'].astype(int)
df['hospitalOccupancy'] = df['hospitalOccupancy'].astype(int)
df['IcuOccupancy'] = df['IcuOccupancy'].astype(int)


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
df

In [None]:
df.dtypes


In [None]:
df.to_csv('df.csv')