# Data Cleaning

### 1. Importando librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re

### 2. Limpieza DataSet `confirmed_global.csv`

In [2]:
# Read `confirmed_global.csv` (path relative to the working directory) into a
# DataFrame; the cells below reshape it in place under the same name.
df1 = pd.read_csv("confirmed_global.csv")

1. El análisis se va a hacer por país, no por provincia, de modo que elimino la columna `Province/State` (junto con las coordenadas `Lat` y `Long`).

In [3]:
# Remove the province label and the coordinate columns in a single call.
df1 = df1.drop(columns=['Province/State', 'Lat', 'Long'])

2. Una vez eliminada dicha columna, agrupamos las filas por país, sumando los valores de todas sus provincias.

In [4]:
# Collapse all rows that share a country into one row by summing every
# remaining column; the country name stays a regular column, not the index.
df1 = df1.groupby('Country/Region', as_index=False).sum()

3. Pasamos las columnas de fecha a formato largo (una fila por país y fecha) y añadimos una columna con su valor correspondiente. También creamos la columna auxiliar `Date-Country` (fecha + país).

In [5]:
# Reshape from wide to long: every non-country column becomes a ("Date",
# "confirmed") pair, then a helper key made of date + country is appended.
df1 = (
    df1
    .melt(id_vars=["Country/Region"], var_name="Date", value_name="confirmed")
    .assign(**{"Date-Country": lambda long_df: long_df["Date"] + long_df["Country/Region"]})
)

# Display only: the re-indexed view is not assigned, so df1 keeps its default
# integer index and 'Date-Country' remains an ordinary column.
df1.set_index("Date-Country")

Unnamed: 0_level_0,Country/Region,Date,confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,2692
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,265897
4/10/21Yemen,Yemen,4/10/21,5276
4/10/21Zambia,Zambia,4/10/21,89918


### 3. Limpieza DataSet `deaths_global.csv`

Repetimos el mismo proceso anterior

In [6]:
# Same pipeline as for the confirmed table, written as one chain:
# load -> drop province/coordinates -> sum per country -> wide-to-long ->
# append the date + country helper key.
df2 = (
    pd.read_csv("deaths_global.csv")
    .drop(columns=["Province/State", "Lat", "Long"])
    .groupby(["Country/Region"])
    .sum()
    .reset_index()
    .melt(id_vars=["Country/Region"], var_name="Date", value_name="deaths")
    .assign(**{"Date-Country": lambda long_df: long_df["Date"] + long_df["Country/Region"]})
)

# Display only: the re-indexed view is not assigned back to df2.
df2.set_index("Date-Country")

Unnamed: 0_level_0,Country/Region,Date,deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,35
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,2838
4/10/21Yemen,Yemen,4/10/21,1031
4/10/21Zambia,Zambia,4/10/21,1226


### 4. Limpieza DataSet `recovered_global.csv`

Repetimos el mismo proceso anterior

In [7]:
# Same pipeline as for the confirmed table, written as one chain:
# load -> drop province/coordinates -> sum per country -> wide-to-long ->
# append the date + country helper key.
df3 = (
    pd.read_csv("recovered_global.csv")
    .drop(columns=["Province/State", "Lat", "Long"])
    .groupby(["Country/Region"])
    .sum()
    .reset_index()
    .melt(id_vars=["Country/Region"], var_name="Date", value_name="recovered")
    .assign(**{"Date-Country": lambda long_df: long_df["Date"] + long_df["Country/Region"]})
)

# Display only: the re-indexed view is not assigned back to df3.
df3.set_index("Date-Country")

Unnamed: 0_level_0,Country/Region,Date,recovered
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,2429
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,231288
4/10/21Yemen,Yemen,4/10/21,2027
4/10/21Zambia,Zambia,4/10/21,86813


### 5. Juntamos todos los datasets en uno solo por país y fecha

In [8]:
# Combine the three long-format tables into one frame with one row per
# (country, date).
#
# The join uses the real key columns instead of the concatenated
# 'Date-Country' string. That way 'Country/Region' and 'Date' in the result
# always come from df1 (the left side): if a pair is missing from df2 or df3,
# only the 'deaths' / 'recovered' value is NaN, never the key columns.
#
# 'Date-Country' is dropped from the right-hand tables so it is not duplicated
# with _x/_y suffixes; errors='ignore' keeps this working if it is absent.
# validate='one_to_one' raises a MergeError if a (country, date) pair is
# duplicated on either side, rather than silently multiplying rows.
df_f1 = pd.merge(
    df1,
    df2.drop(columns=['Date-Country'], errors='ignore'),
    how='left',
    on=['Country/Region', 'Date'],
    validate='one_to_one',
)
df_f2 = pd.merge(
    df_f1,
    df3.drop(columns=['Date-Country'], errors='ignore'),
    how='left',
    on=['Country/Region', 'Date'],
    validate='one_to_one',
)

# Keep only the analysis columns, in a fixed order. .copy() makes df_f2 an
# independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
df_f2 = df_f2[['Country/Region', 'Date', 'confirmed', 'deaths', 'recovered']].copy()

df_f2


Unnamed: 0,Country/Region,Date,confirmed,deaths,recovered
0,Afghanistan,1/22/20,0,0,0
1,Albania,1/22/20,0,0,0
2,Algeria,1/22/20,0,0,0
3,Andorra,1/22/20,0,0,0
4,Angola,1/22/20,0,0,0
...,...,...,...,...,...
85435,Vietnam,4/10/21,2692,35,2429
85436,West Bank and Gaza,4/10/21,265897,2838,231288
85437,Yemen,4/10/21,5276,1031,2027
85438,Zambia,4/10/21,89918,1226,86813


In [17]:
# Parse the 'Date' strings (month/day/two-digit year, e.g. "1/22/20") with an
# explicit format, so parsing does not depend on pandas' format inference.
# assign() builds a new frame instead of writing into df_f2 in place, which
# avoids SettingWithCopyWarning on a frame produced by column selection.
df_f2 = df_f2.assign(Date=pd.to_datetime(df_f2['Date'], format='%m/%d/%y'))

# Show the resulting column types; 'Date' should now be datetime64[ns].
df_f2.dtypes



Country/Region            object
Date              datetime64[ns]
confirmed                  int64
deaths                     int64
recovered                  int64
dtype: object