# Data Cleaning

## 1. Introduccion

## 2. Impotando Librerias

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

## 3. Creacion de `df` utilizando datos de `CoreCode` en `data_core/`

### 3.1. Preparacion dataset `confirmed_global.csv`

In [2]:
url_confirmed_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv"
df1 = pd.read_csv(url_confirmed_global)
#df1 = pd.read_csv('data_core/confirmed_global.csv')

In [3]:
# El analisis se va a hacer por pais, no por provincia de modo que elimino la columna 'Province/State'. Las columnas de 'Lat' y 'Long' 
# se van a eliminar ahora para luego mergearlas con el dataframe final, ya que las coordenadas se cerian alteradas en el 'groupby'.

df1 = df1.drop(['Province/State'], axis=1)
df1 = df1.drop(['Lat'], axis=1)
df1 = df1.drop(['Long'], axis=1)

In [4]:
# Una vez eliminada dichas columnas agrupamos los datos a nivel de fila por pais Sumando asi todos 
# los casos por pais que anteriormente estaban subdivididos por 'Province/State'.

# Comprobamos que efectivamente, hay nombres de paises que aparecen varias veces
print(df1["Country/Region"].value_counts().to_string())

China                               34
Canada                              16
United Kingdom                      12
France                              12
Australia                            8
Netherlands                          5
Denmark                              3
New Zealand                          2
Panama                               1
Niger                                1
Nigeria                              1
North Macedonia                      1
Norway                               1
Oman                                 1
Pakistan                             1
Palau                                1
Peru                                 1
Papua New Guinea                     1
Paraguay                             1
Philippines                          1
Poland                               1
Portugal                             1
Qatar                                1
Romania                              1
Russia                               1
Rwanda                   

In [5]:
df1.loc[df1["Country/Region"] == "Austria"]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,12/26/21,12/27/21,12/28/21,12/29/21,12/30/21,12/31/21,1/1/22,1/2/22,1/3/22,1/4/22
16,Austria,0,0,0,0,0,0,0,0,0,...,1264553,1266103,1268519,1271770,1274995,1278619,1282227,1285510,1288829,1294325


In [6]:
# Vemos que tras el groupby los casos de agrupado correctamente, ya que la suma de la columna de casos de un dia especifico
# es igual a la fila de ese mismo dia para df1 tras esta operacion
df1 = df1.groupby(['Country/Region']).sum().reset_index()
print(df1.loc[df1["Country/Region"] == "Austria"].sum())

Country/Region    Austria
1/22/20                 0
1/23/20                 0
1/24/20                 0
1/25/20                 0
                   ...   
12/31/21          1278619
1/1/22            1282227
1/2/22            1285510
1/3/22            1288829
1/4/22            1294325
Length: 715, dtype: object


In [7]:
# Vemos que solo existe un valor por pais. 
print(df1["Country/Region"].value_counts().to_string())

Afghanistan                         1
Namibia                             1
Netherlands                         1
New Zealand                         1
Nicaragua                           1
Niger                               1
Nigeria                             1
North Macedonia                     1
Norway                              1
Oman                                1
Pakistan                            1
Palau                               1
Panama                              1
Papua New Guinea                    1
Paraguay                            1
Peru                                1
Philippines                         1
Poland                              1
Portugal                            1
Qatar                               1
Romania                             1
Russia                              1
Rwanda                              1
Nepal                               1
Mozambique                          1
Albania                             1
Morocco     

<div align="center">
Confirmamos que el groupby se ha completado con exito
<div>

In [8]:
# Mergeamos las columnas de 'Date-Countrty' por cada pais y anadimos una columna con su valor correspondiente

# Agrupo las columnas de fecha en filas utilizando la funcion `melt` y hago un idetificador unico para mergear con el resto
# de tablas, que sera el (dia)+(el nombre del pais) para poder mergear correctamente con el resto de tablas por dia y pais
df1 = df1.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Confirmed")

# Elimino espacios entre palabras
df1['Country/Region'] = df1['Country/Region'].apply(lambda x: x.replace(' ',''))

# Creo la columna con el identificador para usarla como indentificador unico para el mergeo
df1['Date-Country'] = df1['Date'] + df1['Country/Region']

# Hago esta misma columna indice del dataframe
df1.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
1/4/22Vietnam,Vietnam,1/4/22,1800704
1/4/22WestBankandGaza,WestBankandGaza,1/4/22,471090
1/4/22Yemen,Yemen,1/4/22,10143
1/4/22Zambia,Zambia,1/4/22,265479


### 3.2. Preparacion dataset `deaths_global.csv`

Repetimos el mismo proceso anterior para el dataset `deaths_global.csv`

In [9]:
url_deaths_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv"
df2 = pd.read_csv(url_deaths_global)
#df2 = pd.read_csv('data_core/deaths_global.csv')

df2 = df2.drop(['Province/State'], axis=1)
df2 = df2.drop(['Lat'], axis=1)
df2 = df2.drop(['Long'], axis=1)
df2 = df2.groupby(['Country/Region']).sum().reset_index()
df2 = df2.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Deaths")
df2['Country/Region'] = df2['Country/Region'].apply(lambda x: x.replace(' ',''))
df2['Date-Country'] = df2['Date'] + df2['Country/Region']

df2.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
1/4/22Vietnam,Vietnam,1/4/22,33245
1/4/22WestBankandGaza,WestBankandGaza,1/4/22,4947
1/4/22Yemen,Yemen,1/4/22,1984
1/4/22Zambia,Zambia,1/4/22,3762


### 3.3. Juntamos todos los dataframe `df1` y `df2` en uno solo `df`

In [10]:
# Creo un primer dataframe final (df_f1), mergeando df1 y df2 por 'Date-Country'
df = pd.merge(df1, df2 , how='left', on='Date-Country')

In [11]:
#Elimino las columnas duplicadas
df = df.drop(['Date-Country','Country/Region_y','Date_y'], axis=1)

# Reordeno las Columnas
df = df.rename(columns={'Country/Region_x':'country', 'Date_x':'date','Confirmed':'totalConfirmed','Deaths':'totalDeaths'})
df = df[['country','date','totalConfirmed','totalDeaths']]

In [12]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 139944
Filas df sin duplicados: 139944
Hay 0 filas duplicadas


In [13]:
df['date'] = pd.to_datetime(df.date)
df = df.sort_values(['country','date'], ascending=[True, True])
df = df.reset_index(drop=True)

In [14]:
df['confirmedDay'] = df['totalConfirmed'].diff().fillna(0).astype(int)
df['deathsDay'] = df['totalDeaths'].diff().fillna(0).astype(int)
df

Unnamed: 0,country,date,totalConfirmed,totalDeaths,confirmedDay,deathsDay
0,Afghanistan,2020-01-22,0,0,0,0
1,Afghanistan,2020-01-23,0,0,0,0
2,Afghanistan,2020-01-24,0,0,0,0
3,Afghanistan,2020-01-25,0,0,0,0
4,Afghanistan,2020-01-26,0,0,0,0
...,...,...,...,...,...,...
139939,Zimbabwe,2021-12-31,213258,5004,1530,7
139940,Zimbabwe,2022-01-01,214214,5017,956,13
139941,Zimbabwe,2022-01-02,214214,5017,0,0
139942,Zimbabwe,2022-01-03,216087,5047,1873,30


In [15]:
filter_colum_day = df['confirmedDay'] > 0 
df = df[filter_colum_day]

filter_colum_day = df['deathsDay'] > 0 
df = df[filter_colum_day]

df

Unnamed: 0,country,date,totalConfirmed,totalDeaths,confirmedDay,deathsDay
61,Afghanistan,2020-03-23,40,1,6,1
64,Afghanistan,2020-03-26,80,2,6,1
67,Afghanistan,2020-03-29,114,4,8,2
72,Afghanistan,2020-04-03,269,5,34,1
74,Afghanistan,2020-04-05,299,7,29,2
...,...,...,...,...,...,...
139938,Zimbabwe,2021-12-30,211728,4997,4180,57
139939,Zimbabwe,2021-12-31,213258,5004,1530,7
139940,Zimbabwe,2022-01-01,214214,5017,956,13
139942,Zimbabwe,2022-01-03,216087,5047,1873,30


In [16]:
df.dtypes

country                   object
date              datetime64[ns]
totalConfirmed             int64
totalDeaths                int64
confirmedDay               int64
deathsDay                  int64
dtype: object

## 4. Anado datos geograficos y poblacion a `df`

### 4.1. Importancion de datos 

In [17]:
df4 = pd.read_csv("../data_extra/concap.csv")

### 4.2. Preaparacion del dataframe

In [18]:
df4['CountryName'] = df4['CountryName'].apply(lambda x: x.replace(' ',''))


In [19]:
df4 = df4.rename(columns={'CountryName':'country',
                          'CapitalLatitude':'latitude', 
                          'CapitalLongitude':'longitude', 
                          'CountryCode':'geoId',
                          'ContinentName':'continentExp'})
df4

Unnamed: 0,country,CapitalName,latitude,longitude,geoId,continentExp
0,Somaliland,Hargeisa,9.550000,44.050000,,Africa
1,SouthGeorgiaandSouthSandwichIslands,King Edward Point,-54.283333,-36.500000,GS,Antarctica
2,FrenchSouthernandAntarcticLands,Port-aux-Français,-49.350000,70.216667,TF,Antarctica
3,Palestine,Jerusalem,31.766667,35.233333,PS,Asia
4,AlandIslands,Mariehamn,60.116667,19.900000,AX,Europe
...,...,...,...,...,...,...
240,NorthernCyprus,North Nicosia,35.183333,33.366667,,Europe
241,HongKong,,0.000000,0.000000,HK,Asia
242,HeardIslandandMcDonaldIslands,,0.000000,0.000000,HM,Antarctica
243,BritishIndianOceanTerritory,Diego Garcia,-7.300000,72.400000,IO,Africa


In [20]:
df_DD = df4.drop_duplicates()

print(f"Filas df4: {df4.shape[0]}\nFilas df4 sin duplicados: {df_DD.shape[0]}")

n_duplicados = df4.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

df4 = df4.dropna()
n_null = df4.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Filas df4: 245
Filas df4 sin duplicados: 245
Hay 0 filas duplicadas
Hay 0 registros nulos en total


Control de calidad OK

In [21]:
df = pd.merge(df, df4 , how='left', on='country')
df

Unnamed: 0,country,date,totalConfirmed,totalDeaths,confirmedDay,deathsDay,CapitalName,latitude,longitude,geoId,continentExp
0,Afghanistan,2020-03-23,40,1,6,1,Kabul,34.516667,69.183333,AF,Asia
1,Afghanistan,2020-03-26,80,2,6,1,Kabul,34.516667,69.183333,AF,Asia
2,Afghanistan,2020-03-29,114,4,8,2,Kabul,34.516667,69.183333,AF,Asia
3,Afghanistan,2020-04-03,269,5,34,1,Kabul,34.516667,69.183333,AF,Asia
4,Afghanistan,2020-04-05,299,7,29,2,Kabul,34.516667,69.183333,AF,Asia
...,...,...,...,...,...,...,...,...,...,...,...
67739,Zimbabwe,2021-12-30,211728,4997,4180,57,Harare,-17.816667,31.033333,ZW,Africa
67740,Zimbabwe,2021-12-31,213258,5004,1530,7,Harare,-17.816667,31.033333,ZW,Africa
67741,Zimbabwe,2022-01-01,214214,5017,956,13,Harare,-17.816667,31.033333,ZW,Africa
67742,Zimbabwe,2022-01-03,216087,5047,1873,30,Harare,-17.816667,31.033333,ZW,Africa


In [22]:
df_DD = df.drop_duplicates()

print(f"Filas df4: {df.shape[0]}\nFilas df4 sin duplicados: {df_DD.shape[0]}")

n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

df = df.dropna()
n_null = df.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Filas df4: 67744
Filas df4 sin duplicados: 67744
Hay 0 filas duplicadas
Hay 0 registros nulos en total


Control de calidad ok

### 4.3. Filtro `df` por paises de `Europa`

In [23]:
#filter_europe = df['continentExp'] == 'Europe'
#df = df[filter_europe]

In [24]:
df = df.drop(['geoId', 'CapitalName'], axis=1)
df

Unnamed: 0,country,date,totalConfirmed,totalDeaths,confirmedDay,deathsDay,latitude,longitude,continentExp
0,Afghanistan,2020-03-23,40,1,6,1,34.516667,69.183333,Asia
1,Afghanistan,2020-03-26,80,2,6,1,34.516667,69.183333,Asia
2,Afghanistan,2020-03-29,114,4,8,2,34.516667,69.183333,Asia
3,Afghanistan,2020-04-03,269,5,34,1,34.516667,69.183333,Asia
4,Afghanistan,2020-04-05,299,7,29,2,34.516667,69.183333,Asia
...,...,...,...,...,...,...,...,...,...
67739,Zimbabwe,2021-12-30,211728,4997,4180,57,-17.816667,31.033333,Africa
67740,Zimbabwe,2021-12-31,213258,5004,1530,7,-17.816667,31.033333,Africa
67741,Zimbabwe,2022-01-01,214214,5017,956,13,-17.816667,31.033333,Africa
67742,Zimbabwe,2022-01-03,216087,5047,1873,30,-17.816667,31.033333,Africa


In [25]:
df_DD = df.drop_duplicates()

print(f"Filas df4: {df.shape[0]}\nFilas df4 sin duplicados: {df_DD.shape[0]}")

n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

df = df.dropna()
n_null = df.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Filas df4: 62891
Filas df4 sin duplicados: 62891
Hay 0 filas duplicadas
Hay 0 registros nulos en total


Dataframe prepardo para mergearlo y ser enriquecido

## 5. Union del dataset: `owid-covid-data.csv` con `df`

### 5.1. Creacion de `id` en `df` para el mergeo

In [26]:
# Defino la columna que me serviran para mergear con otros dataset
df['idmerge'] = df['country'] + df['date'].apply(str)
df['idmerge'] = df['idmerge'].apply(lambda x: x.split(' ')[0])
df['idmerge']

0        Afghanistan2020-03-23
1        Afghanistan2020-03-26
2        Afghanistan2020-03-29
3        Afghanistan2020-04-03
4        Afghanistan2020-04-05
                 ...          
67739       Zimbabwe2021-12-30
67740       Zimbabwe2021-12-31
67741       Zimbabwe2022-01-01
67742       Zimbabwe2022-01-03
67743       Zimbabwe2022-01-04
Name: idmerge, Length: 62891, dtype: object

In [27]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")
n_null = df.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Filas df: 62891
Filas df sin duplicados: 62891
Hay 0 filas duplicadas
Hay 0 registros nulos en total


### 5.1. Preaparcion del dataset `owid-covid-data.csv`

#### 5.1.1. Importacion de datos

In [28]:
url_confirmed_global = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
df_ex = pd.read_csv(url_confirmed_global)
#df_ex = pd.read_csv('data_extra/owid-covid-data.csv')

In [29]:
df_ex.dtypes

iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
human_development_index                    float64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object

#### 5.1.2. Tratamiento de las columnas

In [30]:
df_ex.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [31]:
# Elijo las columnas que son relevantes
df_ex = df_ex.drop(df_ex.columns.difference(['continent','location', 'date', 'icu_patients','hosp_patients',
                                          'total_tests','positive_rate','tests_per_case',
                                          'new_vaccinations','people_vaccinated_per_hundred',
                                          'people_fully_vaccinated_per_hundred','population']), axis=1)


In [32]:
df_ex = df_ex.rename(columns={'icu_patients':'icuPatients',
                          'hosp_patients':'hospPatients', 
                          'total_tests':'totalTests', 
                          'positive_rate':'positiveRate',
                          'tests_per_case':'testsPerCase',
                          'new_vaccinations':'newVaccinations',
                          'people_vaccinated_per_hundred':'vaccinatedPerHundred', 
                          'people_fully_vaccinated_per_hundred':'fullyVaccinatedPerHundred',
                          'location':'country'})

#### 5.1.3. Filtro el dataframe por paises `Europeos`

In [33]:
#filter_europe = df_ex['continent'] == 'Europe'
#df_ex = df_ex[filter_europe]

# Elimino la columna de 'continent'
df_ex = df_ex.drop(['continent'], axis=1)

In [34]:
df_ex['date'] = pd.to_datetime(df_ex.date)
df_ex = df_ex.sort_values(['country','date'], ascending=[True, True])
df_ex = df_ex.reset_index(drop=True)
df_ex['country'] = df_ex['country'].apply(lambda x: x.replace(' ',''))

df_ex

Unnamed: 0,country,date,icuPatients,hospPatients,totalTests,positiveRate,testsPerCase,newVaccinations,vaccinatedPerHundred,fullyVaccinatedPerHundred,population
0,Afghanistan,2020-02-24,,,,,,,,,39835428.0
1,Afghanistan,2020-02-25,,,,,,,,,39835428.0
2,Afghanistan,2020-02-26,,,,,,,,,39835428.0
3,Afghanistan,2020-02-27,,,,,,,,,39835428.0
4,Afghanistan,2020-02-28,,,,,,,,,39835428.0
...,...,...,...,...,...,...,...,...,...,...,...
152230,Zimbabwe,2021-12-31,,,1707232.0,0.2858,3.5,,27.33,20.77,15092171.0
152231,Zimbabwe,2022-01-01,,,1710825.0,0.2791,3.6,11020.0,27.37,20.81,15092171.0
152232,Zimbabwe,2022-01-02,,,1713676.0,0.2561,3.9,6595.0,27.39,20.83,15092171.0
152233,Zimbabwe,2022-01-03,,,,,,7062.0,27.41,20.86,15092171.0


#### 5.1.4. Control de Calidad

In [35]:
df_DD = df_ex.drop_duplicates()

print(f"Filas df: {df_ex.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df_ex.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")
n_null = df_ex.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")
print(df_ex.dtypes)

Filas df: 152235
Filas df sin duplicados: 152235
Hay 0 filas duplicadas
Hay 876874 registros nulos en total
country                              object
date                         datetime64[ns]
icuPatients                         float64
hospPatients                        float64
totalTests                          float64
positiveRate                        float64
testsPerCase                        float64
newVaccinations                     float64
vaccinatedPerHundred                float64
fullyVaccinatedPerHundred           float64
population                          float64
dtype: object


Vemos que hay un grancantidad de registros nulos, que mercen implementar acciones para eliminarlos.

#### 5.1.5. Relleno de Registros Nulos

**Signifcado de las columnas del dataset**
- **icu_patients**: Número de pacientes con COVID-19 en unidades de cuidados 
intensivos (UCI) en un día determinado

- **hosp_patients**: Número de pacientes con COVID-19 en el hospital en un día determinado

- **total_tests**: Pruebas totales para COVID-19

- **positive_rate**: La proporción de pruebas de COVID-19 que son positivas, expresada como un 
promedio móvil de 7 días (esto es lo contrario de las pruebas por caso)

- **tests_per_case**: Pruebas realizadas por cada nuevo caso confirmado de COVID-19, dado como un 
promedio móvil de 7 días (esto es lo contrario de Positive_rate)

- **new_vaccinations**: Nuevas dosis de vacuna COVID-19 administradas 
(solo calculadas para días consecutivos)

- **people_vaccinated_per_hundred**: Número total de personas que recibieron al menos una dosis 
de vacuna por cada 100 personas en la población total

- **people_fully_vaccinated_per_hundred**: Número total de personas que recibieron todas las dosis prescritas por el 
protocolo de vacunación por cada 100 personas en la población total

- **population**: poblacion total del pais

##### 5.1.5.1 Columnas `IcuPatients`,  `hospPatients`, `positive_rate`, `tests_per_case` y `new_vaccinations`

Al ser valores No continuos y por dia, es decir que los registros de cada linea es aislado del resto e indivisual, los registros nulos debemos de rellenarlos con `0` puesto que no tenmos mas informacion. 

In [36]:
# Relleno valores nulos con 0
df_ex['icuPatients'] = df_ex['icuPatients'].fillna(0)
df_ex['hospPatients'] = df_ex['hospPatients'].fillna(0)
df_ex['positiveRate'] = df_ex['positiveRate'].fillna(0)
df_ex['testsPerCase'] = df_ex['testsPerCase'].fillna(0)
df_ex['newVaccinations'] = df_ex['newVaccinations'].fillna(0)

In [37]:
df_ex.dtypes

country                              object
date                         datetime64[ns]
icuPatients                         float64
hospPatients                        float64
totalTests                          float64
positiveRate                        float64
testsPerCase                        float64
newVaccinations                     float64
vaccinatedPerHundred                float64
fullyVaccinatedPerHundred           float64
population                          float64
dtype: object

In [38]:
# Cambio tipo de dato a entero
df_ex['icuPatients'] = df_ex['icuPatients'].astype(int)
df_ex['hospPatients'] = df_ex['hospPatients'].astype(int)
df_ex['newVaccinations'] = df_ex['newVaccinations'].astype(int)

In [39]:
# Control de calidad de nulos
print(df_ex['icuPatients'].isnull().sum())
print(df_ex['hospPatients'].isnull().sum())
print(df_ex['positiveRate'].isnull().sum())
print(df_ex['testsPerCase'].isnull().sum())
print(df_ex['newVaccinations'].isnull().sum())

0
0
0
0
0


In [40]:
df_ex.dtypes

country                              object
date                         datetime64[ns]
icuPatients                           int64
hospPatients                          int64
totalTests                          float64
positiveRate                        float64
testsPerCase                        float64
newVaccinations                       int64
vaccinatedPerHundred                float64
fullyVaccinatedPerHundred           float64
population                          float64
dtype: object

Control de Calidad ok

##### 5.1.5.2. Columnas `totalTests`, `vaccinatedPerHundred` y `fullyVaccinatedPerHundred`

Al ser valores acumulativo continuos por dia, los registros nulos debemos de rellenarlos con el registro anterior puesto que no tenemos mas informacion. 
Los Registros del primer dia en este momento son nulos. Rellenando ese dia con 0 despues podemos rellenarlos con el valor anterior, ya que en caso de que no haya datos, por lo mneos podemos decir que nop habran cambiado y no sera del todo incorrecto ya que son valores que van escalando con el paso del tiempo.

In [41]:
# Con esta nueva columnas sabremos cuando cambia de pais a nivel de fila. Ya que al ser el registo de country diferente al anterior, match sera igual 0 
df_ex['match'] = df_ex.country == df_ex.country.shift()

df_ex['match'].dtypes

dtype('bool')

In [42]:
# Cambio tipo de dato a match de boll a str
df_ex['match'] = df_ex['match'].astype(str)

# Replace de bool a 1 y 0
df_ex['match'] = df_ex['match'].replace('False', '0')
df_ex['match'] = df_ex['match'].replace('True', '1')


In [43]:
# Cuando match sea 0 el primer registro de un pais es 0. De esta forma puedo hacer un df.fillna(method='pad') sin que afecte a otros paises ya que el primer registro es 0
df_ex.loc[df_ex.match == '0', 'totalTests'] = '0'
df_ex.loc[df_ex.match == '0', 'vaccinatedPerHundred'] = '0'
df_ex.loc[df_ex.match == '0', 'fullyVaccinatedPerHundred'] = '0'
df_ex.head(2)

Unnamed: 0,country,date,icuPatients,hospPatients,totalTests,positiveRate,testsPerCase,newVaccinations,vaccinatedPerHundred,fullyVaccinatedPerHundred,population,match
0,Afghanistan,2020-02-24,0,0,0.0,0.0,0.0,0,0.0,0.0,39835428.0,0
1,Afghanistan,2020-02-25,0,0,,0.0,0.0,0,,,39835428.0,1


In [44]:
df_ex['totalTests'] = df_ex['totalTests'].fillna(method='pad')
df_ex['vaccinatedPerHundred'] = df_ex['vaccinatedPerHundred'].fillna(method='pad')
df_ex['fullyVaccinatedPerHundred'] = df_ex['fullyVaccinatedPerHundred'].fillna(method='pad')
df_ex.head(5)

Unnamed: 0,country,date,icuPatients,hospPatients,totalTests,positiveRate,testsPerCase,newVaccinations,vaccinatedPerHundred,fullyVaccinatedPerHundred,population,match
0,Afghanistan,2020-02-24,0,0,0,0.0,0.0,0,0,0,39835428.0,0
1,Afghanistan,2020-02-25,0,0,0,0.0,0.0,0,0,0,39835428.0,1
2,Afghanistan,2020-02-26,0,0,0,0.0,0.0,0,0,0,39835428.0,1
3,Afghanistan,2020-02-27,0,0,0,0.0,0.0,0,0,0,39835428.0,1
4,Afghanistan,2020-02-28,0,0,0,0.0,0.0,0,0,0,39835428.0,1


In [45]:
df_ex.dtypes

country                              object
date                         datetime64[ns]
icuPatients                           int64
hospPatients                          int64
totalTests                           object
positiveRate                        float64
testsPerCase                        float64
newVaccinations                       int64
vaccinatedPerHundred                 object
fullyVaccinatedPerHundred            object
population                          float64
match                                object
dtype: object

In [46]:
df_DD = df_ex.drop_duplicates()

print(f"Filas df: {df_ex.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df_ex.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")
n_null = df_ex.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")
print(df_ex.isnull().sum())

Filas df: 152235
Filas df sin duplicados: 152235
Hay 0 filas duplicadas
Hay 1015 registros nulos en total
country                         0
date                            0
icuPatients                     0
hospPatients                    0
totalTests                      0
positiveRate                    0
testsPerCase                    0
newVaccinations                 0
vaccinatedPerHundred            0
fullyVaccinatedPerHundred       0
population                   1015
match                           0
dtype: int64


In [47]:
df_ex['totalTests'] = df_ex['totalTests'].astype(int)
df_ex['vaccinatedPerHundred'] = df_ex['vaccinatedPerHundred'].astype(float)
df_ex['fullyVaccinatedPerHundred'] = df_ex['fullyVaccinatedPerHundred'].astype(float)

##### 5.1.5.3. Columna `population`

In [48]:
df_ex[df_ex['population'].isnull() == True]

Unnamed: 0,country,date,icuPatients,hospPatients,totalTests,positiveRate,testsPerCase,newVaccinations,vaccinatedPerHundred,fullyVaccinatedPerHundred,population,match
65253,International,2020-02-07,0,0,0,0.0,0.0,0,0.00,0.00,,0
65254,International,2020-02-08,0,0,0,0.0,0.0,0,0.00,0.00,,1
65255,International,2020-02-09,0,0,0,0.0,0.0,0,0.00,0.00,,1
65256,International,2020-02-10,0,0,0,0.0,0.0,0,0.00,0.00,,1
65257,International,2020-02-11,0,0,0,0.0,0.0,0,0.00,0.00,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
103476,NorthernCyprus,2021-11-22,0,0,0,0.0,0.0,0,73.07,70.18,,1
103477,NorthernCyprus,2021-11-23,0,0,0,0.0,0.0,0,73.07,70.18,,1
103478,NorthernCyprus,2021-11-24,0,0,0,0.0,0.0,0,73.07,70.18,,1
103479,NorthernCyprus,2021-11-25,0,0,0,0.0,0.0,0,73.07,70.18,,1


In [49]:
df_ex[df_ex['population'].isnull() == True]['country'].unique()

array(['International', 'NorthernCyprus'], dtype=object)

Vemos que los nulos correspondes a paises que no interesan para el analisis, ya que no se encuentran en `df`. Por lo tanto los eliminaremos

In [50]:
df_ex =  df_ex.dropna()

In [51]:
df_DD = df_ex.drop_duplicates()

print(f"Filas df: {df_ex.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df_ex.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")
n_null = df_ex.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")
print(df_ex.isnull().sum())

Filas df: 151220
Filas df sin duplicados: 151220
Hay 0 filas duplicadas
Hay 0 registros nulos en total
country                      0
date                         0
icuPatients                  0
hospPatients                 0
totalTests                   0
positiveRate                 0
testsPerCase                 0
newVaccinations              0
vaccinatedPerHundred         0
fullyVaccinatedPerHundred    0
population                   0
match                        0
dtype: int64


Control de calidad ok

In [52]:
# Cambio population a int
df_ex['population'] = df_ex['population'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ex['population'] = df_ex['population'].astype(int)


#### 5.1.6. Creando `id` para mergear con `df`

In [53]:
# Defino la columna que me serviran para mergear con otros dataset
df_ex['idmerge'] = df_ex['country'] + df_ex['date'].apply(str)
df_ex['idmerge'] = df_ex['idmerge'].apply(lambda x: x.split(' ')[0])
df_ex['idmerge']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ex['idmerge'] = df_ex['country'] + df_ex['date'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ex['idmerge'] = df_ex['idmerge'].apply(lambda x: x.split(' ')[0])


0         Afghanistan2020-02-24
1         Afghanistan2020-02-25
2         Afghanistan2020-02-26
3         Afghanistan2020-02-27
4         Afghanistan2020-02-28
                  ...          
152230       Zimbabwe2021-12-31
152231       Zimbabwe2022-01-01
152232       Zimbabwe2022-01-02
152233       Zimbabwe2022-01-03
152234       Zimbabwe2022-01-04
Name: idmerge, Length: 151220, dtype: object

In [54]:
# Elimino columnas duplicados con df
df_ex = df_ex.drop(['country', 'date', 'match'], axis=1)


Control de calidad ok. 
Dataframe listo para mergear

## 6. Enriquecimiento de `df`

Realizamos inner porque no me interesan registros que esten en un df y en otro no. Son dos datasets que se van actualizando diariamente pero puedo hacer diferencias entre paises en los ultimos dias. pero no resulta sigficativo para dicha tarea.

In [55]:
df

Unnamed: 0,country,date,totalConfirmed,totalDeaths,confirmedDay,deathsDay,latitude,longitude,continentExp,idmerge
0,Afghanistan,2020-03-23,40,1,6,1,34.516667,69.183333,Asia,Afghanistan2020-03-23
1,Afghanistan,2020-03-26,80,2,6,1,34.516667,69.183333,Asia,Afghanistan2020-03-26
2,Afghanistan,2020-03-29,114,4,8,2,34.516667,69.183333,Asia,Afghanistan2020-03-29
3,Afghanistan,2020-04-03,269,5,34,1,34.516667,69.183333,Asia,Afghanistan2020-04-03
4,Afghanistan,2020-04-05,299,7,29,2,34.516667,69.183333,Asia,Afghanistan2020-04-05
...,...,...,...,...,...,...,...,...,...,...
67739,Zimbabwe,2021-12-30,211728,4997,4180,57,-17.816667,31.033333,Africa,Zimbabwe2021-12-30
67740,Zimbabwe,2021-12-31,213258,5004,1530,7,-17.816667,31.033333,Africa,Zimbabwe2021-12-31
67741,Zimbabwe,2022-01-01,214214,5017,956,13,-17.816667,31.033333,Africa,Zimbabwe2022-01-01
67742,Zimbabwe,2022-01-03,216087,5047,1873,30,-17.816667,31.033333,Africa,Zimbabwe2022-01-03


In [56]:
df_ex

Unnamed: 0,icuPatients,hospPatients,totalTests,positiveRate,testsPerCase,newVaccinations,vaccinatedPerHundred,fullyVaccinatedPerHundred,population,idmerge
0,0,0,0,0.0000,0.0,0,0.00,0.00,39835428,Afghanistan2020-02-24
1,0,0,0,0.0000,0.0,0,0.00,0.00,39835428,Afghanistan2020-02-25
2,0,0,0,0.0000,0.0,0,0.00,0.00,39835428,Afghanistan2020-02-26
3,0,0,0,0.0000,0.0,0,0.00,0.00,39835428,Afghanistan2020-02-27
4,0,0,0,0.0000,0.0,0,0.00,0.00,39835428,Afghanistan2020-02-28
...,...,...,...,...,...,...,...,...,...,...
152230,0,0,1707232,0.2858,3.5,0,27.33,20.77,15092171,Zimbabwe2021-12-31
152231,0,0,1710825,0.2791,3.6,11020,27.37,20.81,15092171,Zimbabwe2022-01-01
152232,0,0,1713676,0.2561,3.9,6595,27.39,20.83,15092171,Zimbabwe2022-01-02
152233,0,0,1713676,0.0000,0.0,7062,27.41,20.86,15092171,Zimbabwe2022-01-03


In [57]:
# Mergeo con el dataset de test
df = pd.merge(left=df,right=df_ex, left_on= 'idmerge', right_on='idmerge')
df

Unnamed: 0,country,date,totalConfirmed,totalDeaths,confirmedDay,deathsDay,latitude,longitude,continentExp,idmerge,icuPatients,hospPatients,totalTests,positiveRate,testsPerCase,newVaccinations,vaccinatedPerHundred,fullyVaccinatedPerHundred,population
0,Afghanistan,2020-03-23,40,1,6,1,34.516667,69.183333,Asia,Afghanistan2020-03-23,0,0,0,0.0000,0.0,0,0.00,0.00,39835428
1,Afghanistan,2020-03-26,80,2,6,1,34.516667,69.183333,Asia,Afghanistan2020-03-26,0,0,0,0.0000,0.0,0,0.00,0.00,39835428
2,Afghanistan,2020-03-29,114,4,8,2,34.516667,69.183333,Asia,Afghanistan2020-03-29,0,0,0,0.0000,0.0,0,0.00,0.00,39835428
3,Afghanistan,2020-04-03,269,5,34,1,34.516667,69.183333,Asia,Afghanistan2020-04-03,0,0,0,0.0000,0.0,0,0.00,0.00,39835428
4,Afghanistan,2020-04-05,299,7,29,2,34.516667,69.183333,Asia,Afghanistan2020-04-05,0,0,0,0.0000,0.0,0,0.00,0.00,39835428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62815,Zimbabwe,2021-12-30,211728,4997,4180,57,-17.816667,31.033333,Africa,Zimbabwe2021-12-30,0,0,1701080,0.2957,3.4,0,27.25,20.68,15092171
62816,Zimbabwe,2021-12-31,213258,5004,1530,7,-17.816667,31.033333,Africa,Zimbabwe2021-12-31,0,0,1707232,0.2858,3.5,0,27.33,20.77,15092171
62817,Zimbabwe,2022-01-01,214214,5017,956,13,-17.816667,31.033333,Africa,Zimbabwe2022-01-01,0,0,1710825,0.2791,3.6,11020,27.37,20.81,15092171
62818,Zimbabwe,2022-01-03,216087,5047,1873,30,-17.816667,31.033333,Africa,Zimbabwe2022-01-03,0,0,1713676,0.0000,0.0,7062,27.41,20.86,15092171


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62820 entries, 0 to 62819
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   country                    62820 non-null  object        
 1   date                       62820 non-null  datetime64[ns]
 2   totalConfirmed             62820 non-null  int64         
 3   totalDeaths                62820 non-null  int64         
 4   confirmedDay               62820 non-null  int64         
 5   deathsDay                  62820 non-null  int64         
 6   latitude                   62820 non-null  float64       
 7   longitude                  62820 non-null  float64       
 8   continentExp               62820 non-null  object        
 9   idmerge                    62820 non-null  object        
 10  icuPatients                62820 non-null  int64         
 11  hospPatients               62820 non-null  int64         
 12  tota

In [59]:
x = df
df_DD = x.drop_duplicates()

print(f"Filas df: {x.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = x.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")
n_null = x.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")
print(x.isnull().sum())


Filas df: 62820
Filas df sin duplicados: 62820
Hay 0 filas duplicadas
Hay 0 registros nulos en total
country                      0
date                         0
totalConfirmed               0
totalDeaths                  0
confirmedDay                 0
deathsDay                    0
latitude                     0
longitude                    0
continentExp                 0
idmerge                      0
icuPatients                  0
hospPatients                 0
totalTests                   0
positiveRate                 0
testsPerCase                 0
newVaccinations              0
vaccinatedPerHundred         0
fullyVaccinatedPerHundred    0
population                   0
dtype: int64


Control de calidad ok

## 7. Limpieza de `df`

In [60]:
print(df.dtypes)

country                              object
date                         datetime64[ns]
totalConfirmed                        int64
totalDeaths                           int64
confirmedDay                          int64
deathsDay                             int64
latitude                            float64
longitude                           float64
continentExp                         object
idmerge                              object
icuPatients                           int64
hospPatients                          int64
totalTests                            int64
positiveRate                        float64
testsPerCase                        float64
newVaccinations                       int64
vaccinatedPerHundred                float64
fullyVaccinatedPerHundred           float64
population                            int64
dtype: object


In [61]:
print(len(df['country'].unique()))

170


In [62]:
date_min = df['date'].min()
date_max = df['date'].max()
dias = date_max - date_min
print(date_min,date_max, dias)

2020-01-23 00:00:00 2022-01-04 00:00:00 712 days 00:00:00


In [63]:
df['continentExp'] = df['continentExp'].replace('Australia', 'Oceania')

In [64]:
result = df['continentExp'].unique()
print(list(result))

['Asia', 'Europe', 'Africa', 'North America', 'South America', 'Oceania', 'Central America']


In [65]:
df.columns

Index(['country', 'date', 'totalConfirmed', 'totalDeaths', 'confirmedDay',
       'deathsDay', 'latitude', 'longitude', 'continentExp', 'idmerge',
       'icuPatients', 'hospPatients', 'totalTests', 'positiveRate',
       'testsPerCase', 'newVaccinations', 'vaccinatedPerHundred',
       'fullyVaccinatedPerHundred', 'population'],
      dtype='object')

In [66]:
df = df.drop(['idmerge'], axis=1)

In [67]:
df = df[['date','continentExp', 'country','latitude', 'longitude','totalConfirmed', 'totalDeaths', 'confirmedDay',
       'deathsDay','icuPatients','hospPatients', 'totalTests', 'positiveRate', 'testsPerCase',
       'newVaccinations', 'vaccinatedPerHundred', 'fullyVaccinatedPerHundred','population']]

In [68]:
df = df.rename(columns={'continentExp':'continent'})

## 8. Extracion de columnas temporales de `'date'`

In [69]:
df.dtypes

date                         datetime64[ns]
continent                            object
country                              object
latitude                            float64
longitude                           float64
totalConfirmed                        int64
totalDeaths                           int64
confirmedDay                          int64
deathsDay                             int64
icuPatients                           int64
hospPatients                          int64
totalTests                            int64
positiveRate                        float64
testsPerCase                        float64
newVaccinations                       int64
vaccinatedPerHundred                float64
fullyVaccinatedPerHundred           float64
population                            int64
dtype: object

In [70]:
# Declaro variables
y = df['date'].dt
x = df['date'].dt.isocalendar().week.apply(np.int64)

# Creo nuevas columnas con tipo int64
df['Year'] = y.year
df['Month'] = y.month
df['Week'] = x
df['Day'] = y.day

def str_fixer(value):
    if int(value) < 10:
        return f'0{value}'
    else:
        return str(value)

df["Week"] = df["Week"].apply(str_fixer)

df["Year-Week"] = df["Year"].apply(str) + "-" + df["Week"]


In [71]:
b = list(df["Year-Week"].unique())
b.sort()
b

['2020-04',
 '2020-05',
 '2020-06',
 '2020-07',
 '2020-08',
 '2020-09',
 '2020-10',
 '2020-11',
 '2020-12',
 '2020-13',
 '2020-14',
 '2020-15',
 '2020-16',
 '2020-17',
 '2020-18',
 '2020-19',
 '2020-20',
 '2020-21',
 '2020-22',
 '2020-23',
 '2020-24',
 '2020-25',
 '2020-26',
 '2020-27',
 '2020-28',
 '2020-29',
 '2020-30',
 '2020-31',
 '2020-32',
 '2020-33',
 '2020-34',
 '2020-35',
 '2020-36',
 '2020-37',
 '2020-38',
 '2020-39',
 '2020-40',
 '2020-41',
 '2020-42',
 '2020-43',
 '2020-44',
 '2020-45',
 '2020-46',
 '2020-47',
 '2020-48',
 '2020-49',
 '2020-50',
 '2020-51',
 '2020-52',
 '2020-53',
 '2021-01',
 '2021-02',
 '2021-03',
 '2021-04',
 '2021-05',
 '2021-06',
 '2021-07',
 '2021-08',
 '2021-09',
 '2021-10',
 '2021-11',
 '2021-12',
 '2021-13',
 '2021-14',
 '2021-15',
 '2021-16',
 '2021-17',
 '2021-18',
 '2021-19',
 '2021-20',
 '2021-21',
 '2021-22',
 '2021-23',
 '2021-24',
 '2021-25',
 '2021-26',
 '2021-27',
 '2021-28',
 '2021-29',
 '2021-30',
 '2021-31',
 '2021-32',
 '2021-33',
 '20

In [72]:
# comprobamos que efectivamente las columnas se han creado como int64
df.dtypes

date                         datetime64[ns]
continent                            object
country                              object
latitude                            float64
longitude                           float64
totalConfirmed                        int64
totalDeaths                           int64
confirmedDay                          int64
deathsDay                             int64
icuPatients                           int64
hospPatients                          int64
totalTests                            int64
positiveRate                        float64
testsPerCase                        float64
newVaccinations                       int64
vaccinatedPerHundred                float64
fullyVaccinatedPerHundred           float64
population                            int64
Year                                  int64
Month                                 int64
Week                                 object
Day                                   int64
Year-Week                       

In [73]:
df['date'].max()

Timestamp('2022-01-04 00:00:00')

In [74]:
df['date'].min()

Timestamp('2020-01-23 00:00:00')

In [75]:
df = df.set_index('date')

In [76]:
df

Unnamed: 0_level_0,continent,country,latitude,longitude,totalConfirmed,totalDeaths,confirmedDay,deathsDay,icuPatients,hospPatients,...,testsPerCase,newVaccinations,vaccinatedPerHundred,fullyVaccinatedPerHundred,population,Year,Month,Week,Day,Year-Week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-23,Asia,Afghanistan,34.516667,69.183333,40,1,6,1,0,0,...,0.0,0,0.00,0.00,39835428,2020,3,13,23,2020-13
2020-03-26,Asia,Afghanistan,34.516667,69.183333,80,2,6,1,0,0,...,0.0,0,0.00,0.00,39835428,2020,3,13,26,2020-13
2020-03-29,Asia,Afghanistan,34.516667,69.183333,114,4,8,2,0,0,...,0.0,0,0.00,0.00,39835428,2020,3,13,29,2020-13
2020-04-03,Asia,Afghanistan,34.516667,69.183333,269,5,34,1,0,0,...,0.0,0,0.00,0.00,39835428,2020,4,14,3,2020-14
2020-04-05,Asia,Afghanistan,34.516667,69.183333,299,7,29,2,0,0,...,0.0,0,0.00,0.00,39835428,2020,4,14,5,2020-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-30,Africa,Zimbabwe,-17.816667,31.033333,211728,4997,4180,57,0,0,...,3.4,0,27.25,20.68,15092171,2021,12,52,30,2021-52
2021-12-31,Africa,Zimbabwe,-17.816667,31.033333,213258,5004,1530,7,0,0,...,3.5,0,27.33,20.77,15092171,2021,12,52,31,2021-52
2022-01-01,Africa,Zimbabwe,-17.816667,31.033333,214214,5017,956,13,0,0,...,3.6,11020,27.37,20.81,15092171,2022,1,52,1,2022-52
2022-01-03,Africa,Zimbabwe,-17.816667,31.033333,216087,5047,1873,30,0,0,...,0.0,7062,27.41,20.86,15092171,2022,1,01,3,2022-01


## 9. Exportacion `df` to `.csv`

In [77]:
df.to_csv('../CovidDB.csv')