# Data Cleaning

## 1. Introduccion

## 2. Impotando Librerias

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

## 3. Creacion de `df` utilizando datos de `CoreCode` en `data_core/`

### 3.1. Preparacion dataset `confirmed_global.csv`

In [2]:
#url_confirmed_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv"
#df1 = pd.read_csv(url_confirmed_global)
df1 = pd.read_csv('data_core/confirmed_global.csv')

In [3]:
# El analisis se va a hacer por pais, no por provincia de modo que elimino la columna 'Province/State'. Las columnas de 'Lat' y 'Long' 
# se van a eliminar ahora para luego mergearlas con el dataframe final, ya que las coordenadas se cerian alteradas en el 'groupby'.

df1 = df1.drop(['Province/State'], axis=1)
df1 = df1.drop(['Lat'], axis=1)
df1 = df1.drop(['Long'], axis=1)

In [4]:
# Una vez eliminada dichas columnas agrupamos los datos a nivel de fila por pais Sumando asi todos 
# los casos por pais que anteriormente estaban subdivididos por 'Province/State'.

# Comprobamos que efectivamente, hay nombres de paises que aparecen varias veces
print(df1["Country/Region"].value_counts().to_string())

China                               34
Canada                              16
United Kingdom                      12
France                              12
Australia                            8
Netherlands                          5
Denmark                              3
New Zealand                          2
Panama                               1
Niger                                1
Nigeria                              1
North Macedonia                      1
Norway                               1
Oman                                 1
Pakistan                             1
Palau                                1
Peru                                 1
Papua New Guinea                     1
Paraguay                             1
Philippines                          1
Poland                               1
Portugal                             1
Qatar                                1
Romania                              1
Russia                               1
Rwanda                   

In [5]:
df1.loc[df1["Country/Region"] == "Austria"]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,12/19/21,12/20/21,12/21/21,12/22/21,12/23/21,12/24/21,12/25/21,12/26/21,12/27/21,12/28/21
16,Austria,0,0,0,0,0,0,0,0,0,...,1249641,1251433,1253961,1256230,1258377,1260751,1262836,1264553,1266103,1268519


In [6]:
# Vemos que tras el groupby los casos de agrupado correctamente, ya que la suma de la columna de casos de un dia especifico
# es igual a la fila de ese mismo dia para df1 tras esta operacion
df1 = df1.groupby(['Country/Region']).sum().reset_index()
print(df1.loc[df1["Country/Region"] == "Austria"].sum())

Country/Region    Austria
1/22/20                 0
1/23/20                 0
1/24/20                 0
1/25/20                 0
                   ...   
12/24/21          1260751
12/25/21          1262836
12/26/21          1264553
12/27/21          1266103
12/28/21          1268519
Length: 708, dtype: object


In [7]:
# Vemos que solo existe un valor por pais. 
print(df1["Country/Region"].value_counts().to_string())

Afghanistan                         1
Namibia                             1
Netherlands                         1
New Zealand                         1
Nicaragua                           1
Niger                               1
Nigeria                             1
North Macedonia                     1
Norway                              1
Oman                                1
Pakistan                            1
Palau                               1
Panama                              1
Papua New Guinea                    1
Paraguay                            1
Peru                                1
Philippines                         1
Poland                              1
Portugal                            1
Qatar                               1
Romania                             1
Russia                              1
Rwanda                              1
Nepal                               1
Mozambique                          1
Albania                             1
Morocco     

<div align="center">
Confirmamos que el groupby se ha completado con exito
<div>

In [8]:
# Mergeamos las columnas de 'Date-Countrty' por cada pais y anadimos una columna con su valor correspondiente

# Agrupo las columnas de fecha en filas utilizando la funcion `melt` y hago un idetificador unico para mergear con el resto
# de tablas, que sera el (dia)+(el nombre del pais) para poder mergear correctamente con el resto de tablas por dia y pais
df1 = df1.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Confirmed")

# Creo la columna con el identificador para usarla como indentificador unico para el mergeo
df1['Date-Country'] = df1['Date'] + df1['Country/Region']

# Hago esta misma columna indice del dataframe
df1.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/28/21Vietnam,Vietnam,12/28/21,1680985
12/28/21West Bank and Gaza,West Bank and Gaza,12/28/21,469452
12/28/21Yemen,Yemen,12/28/21,10123
12/28/21Zambia,Zambia,12/28/21,238383


### 3.2. Preparacion dataset `deaths_global.csv`

Repetimos el mismo proceso anterior para el dataset `deaths_global.csv`

In [9]:
#url_deaths_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv"
#df2 = pd.read_csv(url_deaths_global)
df2 = pd.read_csv('data_core/deaths_global.csv')

df2 = df2.drop(['Province/State'], axis=1)
df2 = df2.drop(['Lat'], axis=1)
df2 = df2.drop(['Long'], axis=1)
df2 = df2.groupby(['Country/Region']).sum().reset_index()
df2 = df2.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Deaths")
df2['Date-Country'] = df2['Date'] + df2['Country/Region']

df2.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/28/21Vietnam,Vietnam,12/28/21,31632
12/28/21West Bank and Gaza,West Bank and Gaza,12/28/21,4912
12/28/21Yemen,Yemen,12/28/21,1984
12/28/21Zambia,Zambia,12/28/21,3716


### 3.3. Juntamos todos los dataframe `df1`, `df2` y `df3` en uno solo `df`

In [10]:
# Creo un primer dataframe final (df_f1), mergeando df1 y df2 por 'Date-Country'
df = pd.merge(df1, df2 , how='left', on='Date-Country')

In [11]:
#Elimino las columnas duplicadas
df = df.drop(['Date-Country','Country/Region_y','Date_y'], axis=1)

# Reordeno las Columnas
df = df.rename(columns={'Country/Region_x':'country', 'Date_x':'date','Confirmed':'confirmed','Deaths':'deaths'})
df = df[['country','date','confirmed','deaths']]

In [12]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

Filas df: 138572
Filas df sin duplicados: 138572
Hay 0 filas duplicadas


In [13]:
df['date'] = pd.to_datetime(df.date)
df = df.sort_values(['country','date'], ascending=[True, True])
df = df.reset_index(drop=True)

In [14]:
df['confirmedDay'] = df['confirmed'].diff().fillna(0).astype(int)
df['deathsDay'] = df['deaths'].diff().fillna(0).astype(int)
df

Unnamed: 0,country,date,confirmed,deaths,confirmedDay,deathsDay
0,Afghanistan,2020-01-22,0,0,0,0
1,Afghanistan,2020-01-23,0,0,0,0
2,Afghanistan,2020-01-24,0,0,0,0
3,Afghanistan,2020-01-25,0,0,0,0
4,Afghanistan,2020-01-26,0,0,0,0
...,...,...,...,...,...,...
138567,Zimbabwe,2021-12-24,202736,4871,1392,16
138568,Zimbabwe,2021-12-25,203746,4885,1010,14
138569,Zimbabwe,2021-12-26,204351,4891,605,6
138570,Zimbabwe,2021-12-27,205449,4908,1098,17


In [15]:
df.dtypes

country                 object
date            datetime64[ns]
confirmed                int64
deaths                   int64
confirmedDay             int64
deathsDay                int64
dtype: object

## 4. Anado datos geograficos y poblacion a `df`

### 4.1. Importancion de datos 

In [16]:
df4 = pd.read_csv("data_extra/concap.csv")

### 4.2. Preaparacion del dataframe

In [17]:
df4 = df4.rename(columns={'CountryName':'country',
                          'CapitalLatitude':'latitude', 
                          'CapitalLongitude':'longitude', 
                          'CountryCode':'geoId',
                          'ContinentName':'continentExp'})
df4

Unnamed: 0,country,CapitalName,latitude,longitude,geoId,continentExp
0,Somaliland,Hargeisa,9.550000,44.050000,,Africa
1,South Georgia and South Sandwich Islands,King Edward Point,-54.283333,-36.500000,GS,Antarctica
2,French Southern and Antarctic Lands,Port-aux-Français,-49.350000,70.216667,TF,Antarctica
3,Palestine,Jerusalem,31.766667,35.233333,PS,Asia
4,Aland Islands,Mariehamn,60.116667,19.900000,AX,Europe
...,...,...,...,...,...,...
240,Northern Cyprus,North Nicosia,35.183333,33.366667,,Europe
241,Hong Kong,,0.000000,0.000000,HK,Asia
242,Heard Island and McDonald Islands,,0.000000,0.000000,HM,Antarctica
243,British Indian Ocean Territory,Diego Garcia,-7.300000,72.400000,IO,Africa


In [18]:
df_DD = df4.drop_duplicates()

print(f"Filas df4: {df4.shape[0]}\nFilas df4 sin duplicados: {df_DD.shape[0]}")

n_duplicados = df4.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")

df4 = df4.dropna()
n_null = df4.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Filas df4: 245
Filas df4 sin duplicados: 245
Hay 0 filas duplicadas
Hay 0 registros nulos en total


Control de calidad OK

In [19]:
df = pd.merge(df, df4 , how='left', on='country')
df

Unnamed: 0,country,date,confirmed,deaths,confirmedDay,deathsDay,CapitalName,latitude,longitude,geoId,continentExp
0,Afghanistan,2020-01-22,0,0,0,0,Kabul,34.516667,69.183333,AF,Asia
1,Afghanistan,2020-01-23,0,0,0,0,Kabul,34.516667,69.183333,AF,Asia
2,Afghanistan,2020-01-24,0,0,0,0,Kabul,34.516667,69.183333,AF,Asia
3,Afghanistan,2020-01-25,0,0,0,0,Kabul,34.516667,69.183333,AF,Asia
4,Afghanistan,2020-01-26,0,0,0,0,Kabul,34.516667,69.183333,AF,Asia
...,...,...,...,...,...,...,...,...,...,...,...
138567,Zimbabwe,2021-12-24,202736,4871,1392,16,Harare,-17.816667,31.033333,ZW,Africa
138568,Zimbabwe,2021-12-25,203746,4885,1010,14,Harare,-17.816667,31.033333,ZW,Africa
138569,Zimbabwe,2021-12-26,204351,4891,605,6,Harare,-17.816667,31.033333,ZW,Africa
138570,Zimbabwe,2021-12-27,205449,4908,1098,17,Harare,-17.816667,31.033333,ZW,Africa


### 4.3. Filtro `df` por paises de `Europa`

In [20]:
filter_europe = df['continentExp'] == 'Europe'
df = df[filter_europe]

In [21]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")
n_null = df.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Filas df: 33229
Filas df sin duplicados: 33229
Hay 0 filas duplicadas
Hay 0 registros nulos en total


Control de calidad OK

In [22]:
df = df.drop(['continentExp'], axis=1)
df

Unnamed: 0,country,date,confirmed,deaths,confirmedDay,deathsDay,CapitalName,latitude,longitude,geoId
707,Albania,2020-01-22,0,0,-157998,-7355,Tirana,41.316667,19.816667,AL
708,Albania,2020-01-23,0,0,0,0,Tirana,41.316667,19.816667,AL
709,Albania,2020-01-24,0,0,0,0,Tirana,41.316667,19.816667,AL
710,Albania,2020-01-25,0,0,0,0,Tirana,41.316667,19.816667,AL
711,Albania,2020-01-26,0,0,0,0,Tirana,41.316667,19.816667,AL
...,...,...,...,...,...,...,...,...,...,...
132204,United Kingdom,2021-12-24,11958841,148324,121932,137,London,51.500000,-0.083333,GB
132205,United Kingdom,2021-12-25,11958928,148324,87,0,London,51.500000,-0.083333,GB
132206,United Kingdom,2021-12-26,11958928,148324,0,0,London,51.500000,-0.083333,GB
132207,United Kingdom,2021-12-27,12277814,148470,318886,146,London,51.500000,-0.083333,GB


Dataframe prepardo para mergearlo y ser enriquecido

## 5. Union del dataset: `owid-covid-data.csv` con `df`

### 5.1. Creacion de `id` en `df` para el mergeo

In [23]:
# Defino la columna que me serviran para mergear con otros dataset
df['id-merge'] = df['country'] + df['date'].apply(str)
df['id-merge'] = df['id-merge'].apply(lambda x: x.split(' ')[0])
df['id-merge']

707       Albania2020-01-22
708       Albania2020-01-23
709       Albania2020-01-24
710       Albania2020-01-25
711       Albania2020-01-26
                ...        
132204               United
132205               United
132206               United
132207               United
132208               United
Name: id-merge, Length: 33229, dtype: object

In [24]:
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")
n_null = df.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")

Filas df: 33229
Filas df sin duplicados: 33229
Hay 0 filas duplicadas
Hay 0 registros nulos en total


### 5.1. Preaparcion del dataset `owid-covid-data.csv`

#### 5.1.1. Importacion de datos

In [25]:
#url_confirmed_global = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
#df_ex = pd.read_csv(url_confirmed_global)
df_ex = pd.read_csv('data_extra/owid-covid-data.csv')

#### 5.1.2. Extracion de columnas

In [26]:
df_ex.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [27]:
df_ex = df_ex.drop(df_ex.columns.difference(['continent','location', 'date', 'icu_patients','hosp_patients',
                                          'total_tests','positive_rate','tests_per_case',
                                          'new_vaccinations','people_vaccinated_per_hundred',
                                          'people_fully_vaccinated_per_hundred','population']), axis=1)


In [28]:
df_ex

Unnamed: 0,continent,location,date,icu_patients,hosp_patients,total_tests,positive_rate,tests_per_case,new_vaccinations,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,population
0,Asia,Afghanistan,2020-02-24,,,,,,,,,39835428.0
1,Asia,Afghanistan,2020-02-25,,,,,,,,,39835428.0
2,Asia,Afghanistan,2020-02-26,,,,,,,,,39835428.0
3,Asia,Afghanistan,2020-02-27,,,,,,,,,39835428.0
4,Asia,Afghanistan,2020-02-28,,,,,,,,,39835428.0
...,...,...,...,...,...,...,...,...,...,...,...,...
151543,Africa,Zimbabwe,2021-12-28,,,1686251.0,0.3388,3.0,3903.0,27.21,20.64,15092171.0
151544,Africa,Zimbabwe,2021-12-29,,,1693985.0,0.2550,3.9,11952.0,27.25,20.68,15092171.0
151545,Africa,Zimbabwe,2021-12-30,,,1701080.0,0.2957,3.4,,,,15092171.0
151546,Africa,Zimbabwe,2021-12-31,,,,,,,27.33,20.77,15092171.0


## 6. Enriquecimiento de `df`

In [29]:
#df.columns

In [30]:
# Mergeo con el dataset de test
#df = pd.merge(df, df_ex, how='left', on='id-merge')
#df

In [31]:
''' 
df_DD = df.drop_duplicates()

print(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")
n_duplicados = df.shape[0] - df_DD.shape[0]
print(f"Hay {n_duplicados} filas duplicadas")
n_null = df.isnull().sum().sum()
print(f"Hay {n_null} registros nulos en total")
'''

' \ndf_DD = df.drop_duplicates()\n\nprint(f"Filas df: {df.shape[0]}\nFilas df sin duplicados: {df_DD.shape[0]}")\nn_duplicados = df.shape[0] - df_DD.shape[0]\nprint(f"Hay {n_duplicados} filas duplicadas")\nn_null = df.isnull().sum().sum()\nprint(f"Hay {n_null} registros nulos en total")\n'

## 8. Dataframe final `df`

### 7.3. Rellenando valores nulos

In [32]:
'''
# con este filtro voy a eliminar varios paises que no aparecen directamente en mis datos enriquecidos 
filter_Confirmed_0 = df['population'] <= 0
df = df[filter_Confirmed_0]
df
'''

"\n# con este filtro voy a eliminar varios paises que no aparecen directamente en mis datos enriquecidos \nfilter_Confirmed_0 = df['population'] <= 0\ndf = df[filter_Confirmed_0]\ndf\n"

In [33]:
'''
df['positivity_rate'] = df['positivity_rate'].fillna(0)

df['FirstDose'] = df['FirstDose'].fillna(0)
df['DosesReceived'] = df['DosesReceived'].fillna(0)
df['DosesExported'] = df['DosesExported'].fillna(0)

df['Hospital_Occupancy'] = df['Hospital_Occupancy'].fillna(0)
df['ICU_Occupancy'] = df['ICU_Occupancy'].fillna(0)

print((df.isnull().sum()/len(df))*100)
print(df.shape)
'''


"\ndf['positivity_rate'] = df['positivity_rate'].fillna(0)\n\ndf['FirstDose'] = df['FirstDose'].fillna(0)\ndf['DosesReceived'] = df['DosesReceived'].fillna(0)\ndf['DosesExported'] = df['DosesExported'].fillna(0)\n\ndf['Hospital_Occupancy'] = df['Hospital_Occupancy'].fillna(0)\ndf['ICU_Occupancy'] = df['ICU_Occupancy'].fillna(0)\n\nprint((df.isnull().sum()/len(df))*100)\nprint(df.shape)\n"

## 8. Exportacion `df` a `.csv`

In [34]:
'''
#Cambio nombre a las columnas en un mismo formato camelCase
df = df.rename(columns={'Country':'country',
                        'Lat':'latitude',
                        'Long':'longitude',
                        'Date':'date',
                        'Year':'year',
                        'Month':'month',
                        'Week':'week',
                        'Day':'day',
                        'Year-Week':'yearWeek',
                        'Confirmed':'confirmed',
                        'Deaths':'deaths',
                        'tests_done':'testDone',
                        'testing_rate':'testingRate',
                        'positivity_rate':'positivityRate',
                        'FirstDose':'firstDose',
                        'Hospital_Occupancy':'hospitalOccupancy',
                        'ICU_Occupancy':'IcuOccupancy',
                        })
'''

"\n#Cambio nombre a las columnas en un mismo formato camelCase\ndf = df.rename(columns={'Country':'country',\n                        'Lat':'latitude',\n                        'Long':'longitude',\n                        'Date':'date',\n                        'Year':'year',\n                        'Month':'month',\n                        'Week':'week',\n                        'Day':'day',\n                        'Year-Week':'yearWeek',\n                        'Confirmed':'confirmed',\n                        'Deaths':'deaths',\n                        'tests_done':'testDone',\n                        'testing_rate':'testingRate',\n                        'positivity_rate':'positivityRate',\n                        'FirstDose':'firstDose',\n                        'Hospital_Occupancy':'hospitalOccupancy',\n                        'ICU_Occupancy':'IcuOccupancy',\n                        })\n"

In [35]:
#df = df.reset_index(drop=True)

#df.head(5)

In [36]:
#df['testingRate'] = np.round(df['testingRate'], decimals = 2) 

#df['positivityRate'] = np.round(df['positivityRate'], decimals = 2) 

In [37]:

#df.dtypes


In [38]:
#df.to_csv('df.csv')