# Data Cleaning

### Importando Librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re

### 1. Limpieza DataSet `confirmed_global.csv`

In [2]:
# Read the confirmed-cases CSV into df1.
confirmed_csv = "confirmed_global.csv"
df1 = pd.read_csv(confirmed_csv)

- 1.1. El análisis se va a hacer por país, no por provincia, de modo que elimino la columna `Province/State`. Las columnas `Lat` y `Long` se eliminan ahora para mergearlas más adelante con el dataframe final, ya que las coordenadas se verían alteradas en el `groupby`.

In [3]:
# The province and coordinate columns are not used in the per-country analysis,
# so all three are removed in a single call.
df1 = df1.drop(columns=['Province/State', 'Lat', 'Long'])

- 1.2. Una vez eliminadas dichas columnas, agrupamos las filas por país, sumando así todos los casos de cada país que anteriormente estaban subdivididos por `'Province/State'`.

In [4]:
# Confirm that some country names do appear on more than one row
rows_per_country = df1["Country/Region"].value_counts()
print(rows_per_country.to_string())

China                               33
Canada                              16
France                              12
United Kingdom                      12
Australia                            8
Netherlands                          5
Denmark                              3
Panama                               1
Nicaragua                            1
Niger                                1
Nigeria                              1
North Macedonia                      1
Norway                               1
Oman                                 1
Pakistan                             1
Afghanistan                          1
Papua New Guinea                     1
Paraguay                             1
Peru                                 1
Philippines                          1
Poland                               1
Portugal                             1
Qatar                                1
Romania                              1
Russia                               1
New Zealand              

In [5]:
# Show every row that belongs to Australia (one per province before grouping)
df1.loc[df1["Country/Region"].eq("Australia")]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
8,Australia,0,0,0,0,0,0,0,0,0,...,123,123,123,123,123,123,123,123,123,123
9,Australia,0,0,0,0,3,4,4,4,4,...,5296,5299,5300,5303,5310,5316,5318,5320,5324,5330
10,Australia,0,0,0,0,0,0,0,0,0,...,109,111,112,112,112,112,112,112,112,112
11,Australia,0,0,0,0,0,0,0,1,3,...,1485,1488,1489,1492,1491,1497,1500,1501,1502,1502
12,Australia,0,0,0,0,0,0,0,0,0,...,658,658,659,661,661,662,663,665,665,666
13,Australia,0,0,0,0,0,0,0,0,0,...,234,234,234,234,234,234,234,234,234,234
14,Australia,0,0,0,0,1,1,1,1,2,...,20484,20484,20484,20484,20484,20484,20484,20484,20485,20485
15,Australia,0,0,0,0,0,0,0,0,0,...,944,944,947,948,950,951,951,951,951,953


In [6]:
# Collapse the rows into a single row per country by summing every date column.
df1 = (
    df1
    .groupby(['Country/Region'])
    .sum()
    .reset_index()
)

# Print the grouped Australia figures so they can be compared by eye with the
# per-province rows displayed before the groupby.
australia_mask = df1["Country/Region"] == "Australia"
print(df1.loc[australia_mask].sum())

Country/Region    Australia
1/22/20                   0
1/23/20                   0
1/24/20                   0
1/25/20                   0
                    ...    
4/6/21                29379
4/7/21                29385
4/8/21                29390
4/9/21                29396
4/10/21               29405
Length: 446, dtype: object


In [7]:
# Check that every country now appears exactly once.
# The previous version nested a print() call inside another print(); the inner
# call returns None, so the outer one emitted a list of counts followed by a
# stray "None". Print the table once and enforce the invariant explicitly.
country_counts = df1["Country/Region"].value_counts()
print(country_counts.to_string())
assert (country_counts == 1).all(), "Some countries still appear more than once after the groupby"

Afghanistan                         1
Albania                             1
Namibia                             1
Nepal                               1
Netherlands                         1
New Zealand                         1
Nicaragua                           1
Niger                               1
Nigeria                             1
North Macedonia                     1
Norway                              1
Oman                                1
Pakistan                            1
Panama                              1
Papua New Guinea                    1
Paraguay                            1
Peru                                1
Philippines                         1
Poland                              1
Portugal                            1
Qatar                               1
Romania                             1
Russia                              1
Mozambique                          1
Morocco                             1
Montenegro                          1
Malaysia    

**Confirmamos que el groupby se ha completado con exito.** 

- 1.3. Convertimos las columnas de fecha en filas (una por país y día) y añadimos una columna con su valor correspondiente

In [8]:
# Unpivot the date columns into rows with `melt`, leaving one row per
# (country, date) pair with the count stored in "Confirmed".
df1 = pd.melt(
    df1,
    id_vars=["Country/Region"],
    var_name="Date",
    value_name="Confirmed",
)

# Build the join key used to merge with the other tables: date string
# immediately followed by the country name.
df1['Date-Country'] = df1['Date'] + df1['Country/Region']

# Preview the frame indexed by the key. The result is only displayed, not
# assigned, so 'Date-Country' remains a regular column of df1.
df1.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,2692
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,265897
4/10/21Yemen,Yemen,4/10/21,5276
4/10/21Zambia,Zambia,4/10/21,89918


### 2. Limpieza DataSet `deaths_global.csv`

- 2.1. Repetimos el mismo proceso anterior para el dataset `deaths_global.csv`

In [9]:
# Same steps as for the confirmed table: load, drop province/coordinates,
# sum per country, then unpivot the date columns into a "Deaths" column.
df2 = (
    pd.read_csv("deaths_global.csv")
    .drop(columns=['Province/State', 'Lat', 'Long'])
    .groupby(['Country/Region'])
    .sum()
    .reset_index()
    .melt(id_vars=["Country/Region"], var_name="Date", value_name="Deaths")
)

# Join key: date string immediately followed by the country name.
df2['Date-Country'] = df2['Date'] + df2['Country/Region']

# Display-only preview indexed by the key; df2 itself is left unchanged.
df2.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,35
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,2838
4/10/21Yemen,Yemen,4/10/21,1031
4/10/21Zambia,Zambia,4/10/21,1226


### 3. Limpieza DataSet `recovered_global.csv`

- 3.1. Repetimos el mismo proceso anterior para el dataset `recovered_global.csv`

In [10]:
# Same steps as for the confirmed table: load, drop province/coordinates,
# sum per country, then unpivot the date columns into a "Recovered" column.
df3 = (
    pd.read_csv("recovered_global.csv")
    .drop(columns=['Province/State', 'Lat', 'Long'])
    .groupby(['Country/Region'])
    .sum()
    .reset_index()
    .melt(id_vars=["Country/Region"], var_name="Date", value_name="Recovered")
)

# Join key: date string immediately followed by the country name.
df3['Date-Country'] = df3['Date'] + df3['Country/Region']

# Display-only preview indexed by the key; df3 itself is left unchanged.
df3.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Recovered
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,2429
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,231288
4/10/21Yemen,Yemen,4/10/21,2027
4/10/21Zambia,Zambia,4/10/21,86813


### 4. Juntamos todos los dataset en uno solo por la clave `Date-Country`

In [11]:

# First intermediate frame (df_f1): df1 left-joined with df2 on 'Date-Country'
df_f1 = df1.merge(df2, how='left', on='Date-Country')

# Second frame (df_f2): the previous result left-joined with df3 on the same key
df_f2 = df_f1.merge(df3, how='left', on='Date-Country')

df_f2.head(3)


Unnamed: 0,Country/Region_x,Date_x,Confirmed,Date-Country,Country/Region_y,Date_y,Deaths,Country/Region,Date,Recovered
0,Afghanistan,1/22/20,0,1/22/20Afghanistan,Afghanistan,1/22/20,0,Afghanistan,1/22/20,0
1,Albania,1/22/20,0,1/22/20Albania,Albania,1/22/20,0,Albania,1/22/20,0
2,Algeria,1/22/20,0,1/22/20Algeria,Algeria,1/22/20,0,Algeria,1/22/20,0


In [12]:
# Discard the join key and the suffixed duplicate columns left by the two merges
redundant_cols = ['Date-Country', 'Country/Region_y', 'Date_y', 'Country/Region_x', 'Date_x']
df_f2 = df_f2.drop(columns=redundant_cols)

# Put the remaining columns in their final order
final_order = ['Country/Region', 'Date', 'Confirmed', 'Deaths', 'Recovered']
df_f2 = df_f2[final_order]

df_f2.head(3)

Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered
0,Afghanistan,1/22/20,0,0,0
1,Albania,1/22/20,0,0,0
2,Algeria,1/22/20,0,0,0
3,Andorra,1/22/20,0,0,0
4,Angola,1/22/20,0,0,0
...,...,...,...,...,...
85435,Vietnam,4/10/21,2692,35,2429
85436,West Bank and Gaza,4/10/21,265897,2838,231288
85437,Yemen,4/10/21,5276,1031,2027
85438,Zambia,4/10/21,89918,1226,86813


### 5. Cambio de tipo de datos

In [13]:
# The 'Date' strings are month/day/two-digit-year (e.g. "1/22/20"). Passing the
# format explicitly avoids relying on pandas' format inference and makes any
# value that does not match fail loudly instead of being guessed.
# (The bare `df_f2.dtypes` that used to precede this line was removed: it was
# not the last expression of the cell, so its value was discarded.)
df_f2['Date'] = pd.to_datetime(df_f2['Date'], format='%m/%d/%y')

# Show the resulting column types; 'Date' should now be datetime64[ns].
df_f2.dtypes



Country/Region            object
Date              datetime64[ns]
Confirmed                  int64
Deaths                     int64
Recovered                  int64
dtype: object