# Data Cleaning

### Impotando Librerias

In [1]:
import pandas as pd
import numpy as np
import datetime

### URL Data [Johns Hopkins University (JHU)](https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases)

### 1. Limpieza DataSet `confirmed_global.csv`

In [2]:
#url_confirmed_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv"
#df1 = pd.read_csv(url_confirmed_global)
df1 = pd.read_csv('data_core/confirmed_global.csv')

#### 1.1. El analisis se va a hacer por pais, no por provincia de modo que elimino la columna `Province/State`. Las columnas de `Lat` y `Long` se van a eliminar ahora para luego mergearlas con el dataframe final, ya que las coordenadas se cerian alteradas en el `groupby`.

In [3]:
df1 = df1.drop(['Province/State'], axis=1)
df1 = df1.drop(['Lat'], axis=1)
df1 = df1.drop(['Long'], axis=1)

#### 1.2. Una vez eliminada dichas columnas agrupamos los datos a nivel de fila por pais Sumando asi todos los casos por pais que anteriormente estaban subdivididos por `'Province/State'`.

In [4]:
# Comprobamos que efectivamente, hay nombres de paises que aparecen varias veces
print(df1["Country/Region"].value_counts().to_string())

China                               34
Canada                              16
United Kingdom                      12
France                              12
Australia                            8
Netherlands                          5
Denmark                              3
New Zealand                          2
Panama                               1
Niger                                1
Nigeria                              1
North Macedonia                      1
Norway                               1
Oman                                 1
Pakistan                             1
Palau                                1
Peru                                 1
Papua New Guinea                     1
Paraguay                             1
Philippines                          1
Poland                               1
Portugal                             1
Qatar                                1
Romania                              1
Russia                               1
Rwanda                   

In [5]:
df1.loc[df1["Country/Region"] == "Austria"]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,12/19/21,12/20/21,12/21/21,12/22/21,12/23/21,12/24/21,12/25/21,12/26/21,12/27/21,12/28/21
16,Austria,0,0,0,0,0,0,0,0,0,...,1249641,1251433,1253961,1256230,1258377,1260751,1262836,1264553,1266103,1268519


In [6]:
# Vemos que tras el groupby los casos de agrupado correctamente, ya que la suma de la columna de casos de un dia especifico
# es igual a la fila de ese mismo dia para df1 tras esta operacion
df1 = df1.groupby(['Country/Region']).sum().reset_index()
print(df1.loc[df1["Country/Region"] == "Austria"].sum())

Country/Region    Austria
1/22/20                 0
1/23/20                 0
1/24/20                 0
1/25/20                 0
                   ...   
12/24/21          1260751
12/25/21          1262836
12/26/21          1264553
12/27/21          1266103
12/28/21          1268519
Length: 708, dtype: object


In [7]:
# Vemos que solo existe un valor por pais. 
print(df1["Country/Region"].value_counts().to_string())

Afghanistan                         1
Namibia                             1
Netherlands                         1
New Zealand                         1
Nicaragua                           1
Niger                               1
Nigeria                             1
North Macedonia                     1
Norway                              1
Oman                                1
Pakistan                            1
Palau                               1
Panama                              1
Papua New Guinea                    1
Paraguay                            1
Peru                                1
Philippines                         1
Poland                              1
Portugal                            1
Qatar                               1
Romania                             1
Russia                              1
Rwanda                              1
Nepal                               1
Mozambique                          1
Albania                             1
Morocco     

**Confirmamos que el groupby se ha completado con exito.** 

#### 1.3. Mergeamos las columnas de fecha por cada pais y anadimos una columna con su valor correspondiente

In [8]:
# Agrupo las columnas de fecha en filas utilizando la funcion `melt` y hago un idetificador unico para mergear con el resto
# de tablas, que sera el (dia)+(el nombre del pais) para poder mergear correctamente con el resto de tablas por dia y pais
df1 = df1.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Confirmed")

# Creo la columna con el identificador para usarla como indentificador unico para el mergeo
df1['Date-Country'] = df1['Date'] + df1['Country/Region']

# Hago esta misma columna indice del dataframe
df1.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/28/21Vietnam,Vietnam,12/28/21,1680985
12/28/21West Bank and Gaza,West Bank and Gaza,12/28/21,469452
12/28/21Yemen,Yemen,12/28/21,10123
12/28/21Zambia,Zambia,12/28/21,238383


### 2. Limpieza DataSet `deaths_global.csv`

- 2.1. Repetimos el mismo proceso anterior para el dataset `deaths_global.csv`

In [9]:
#url_deaths_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv"
#df2 = pd.read_csv(url_deaths_global)
df2 = pd.read_csv('data_core/deaths_global.csv')

df2 = df2.drop(['Province/State'], axis=1)
df2 = df2.drop(['Lat'], axis=1)
df2 = df2.drop(['Long'], axis=1)
df2 = df2.groupby(['Country/Region']).sum().reset_index()
df2 = df2.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Deaths")
df2['Date-Country'] = df2['Date'] + df2['Country/Region']

df2.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/28/21Vietnam,Vietnam,12/28/21,31632
12/28/21West Bank and Gaza,West Bank and Gaza,12/28/21,4912
12/28/21Yemen,Yemen,12/28/21,1984
12/28/21Zambia,Zambia,12/28/21,3716


### 3. Limpieza DataSet `recovered_global.csv`

- 3.1. Repetimos el mismo proceso anterior para el dataset `recovered_global.csv`

In [10]:
#url_recovered_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_recovered_global.csv&filename=time_series_covid19_recovered_global.csv"
#df3 = pd.read_csv(url_recovered_global)
df3 = pd.read_csv('data_core/recovered_global.csv')


df3 = df3.drop(['Province/State'], axis=1)
df3 = df3.drop(['Lat'], axis=1)
df3 = df3.drop(['Long'], axis=1)
df3 = df3.groupby(['Country/Region']).sum().reset_index()
df3 = df3.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Recovered")
df3['Date-Country'] = df3['Date'] + df3['Country/Region']
df3.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Recovered
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/28/21Vietnam,Vietnam,12/28/21,0
12/28/21West Bank and Gaza,West Bank and Gaza,12/28/21,0
12/28/21Yemen,Yemen,12/28/21,0
12/28/21Zambia,Zambia,12/28/21,0


### 4. Juntamos todos los dataset (df1, df2 y df3) en uno solo (df) por el indice Date

In [11]:
# Creo un primer dataframe final (df_f1), mergeando df1 y df2 por 'Date-Country'
df = pd.merge(df1, df2 , how='left', on='Date-Country')

# Creo un sefundo dataframe final, mergeando el anterior dataframe (df_f1) y df3 por 'Date-Country'
df = pd.merge(df, df3 , how='left', on='Date-Country')


In [12]:
#Elimino las columnas duplicadas
df = df.drop(['Date-Country','Country/Region_y','Date_y', 'Country/Region_x','Date_x'], axis=1)

# Reordeno las Columnas
df = df[['Country/Region','Date','Confirmed','Deaths','Recovered']]
df = df.rename(columns={'Country/Region':'Country'})

### 5. Anado las columnas de "Lat" y "Long" al dataframe df

In [13]:
df4 = pd.read_csv("data_extra/concap.csv")
df4.drop(df4.columns.difference(['CountryName','CapitalLatitude','CapitalLongitude','CountryCode']), 1, inplace=True)
df4 = df4.drop_duplicates()

df4 = df4.rename(columns={'CountryName':'Country',
                          'CapitalLatitude':'Lat', 
                          'CapitalLongitude':'Long', 
                          'CountryCode':'geoId'})
df4


  df4.drop(df4.columns.difference(['CountryName','CapitalLatitude','CapitalLongitude','CountryCode']), 1, inplace=True)


Unnamed: 0,Country,Lat,Long,geoId
0,Somaliland,9.550000,44.050000,
1,South Georgia and South Sandwich Islands,-54.283333,-36.500000,GS
2,French Southern and Antarctic Lands,-49.350000,70.216667,TF
3,Palestine,31.766667,35.233333,PS
4,Aland Islands,60.116667,19.900000,AX
...,...,...,...,...
240,Northern Cyprus,35.183333,33.366667,
241,Hong Kong,0.000000,0.000000,HK
242,Heard Island and McDonald Islands,0.000000,0.000000,HM
243,British Indian Ocean Territory,-7.300000,72.400000,IO


In [14]:
df = pd.merge(df, df4 , how='left', on='Country')

### 6. Anado las columnas de informacion geografica de los paises al dataframe (df)

In [15]:
#url_dayly_covid = "https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/csv/data.csv"
#df5 = pd.read_csv(url_dayly_covid)

df5 = pd.read_csv('data_extra/Cases-Death-Country-TimeSeries.csv')
df5.drop(df5.columns.difference(['geoId','popData2020', 'continentExp']), 1, inplace=True)
df5 = df5.drop_duplicates()

  df5.drop(df5.columns.difference(['geoId','popData2020', 'continentExp']), 1, inplace=True)


In [16]:
df = pd.merge(df, df5 , how='left', on='geoId')
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,popData2020,continentExp
0,Afghanistan,1/22/20,0,0,0,34.516667,69.183333,AF,,
1,Albania,1/22/20,0,0,0,41.316667,19.816667,AL,,
2,Algeria,1/22/20,0,0,0,36.750000,3.050000,DZ,,
3,Andorra,1/22/20,0,0,0,42.500000,1.516667,AD,,
4,Angola,1/22/20,0,0,0,-8.833333,13.216667,AO,,
...,...,...,...,...,...,...,...,...,...,...
138567,Vietnam,12/28/21,1680985,31632,0,21.033333,105.850000,VN,,
138568,West Bank and Gaza,12/28/21,469452,4912,0,,,,,
138569,Yemen,12/28/21,10123,1984,0,15.350000,44.200000,YE,,
138570,Zambia,12/28/21,238383,3716,0,-15.416667,28.283333,ZM,,


### 7. Filtro todo el dataframe por continentExP: "Europe"

Para poder aprovechar los dataset de data_extra, que estan centrados unicamente en Europa, y ademas poder centrar mejor el analisis, voy a filtrar el dataframe eliminando todos los paises que no son europeos.

In [17]:
# Filtro el dataframe (df) para paises europeos 
filter_europe = df['continentExp'] == 'Europe'
df = df[filter_europe]

### 8. Cambio de tipo de datos e indice

In [18]:
df.dtypes

Country          object
Date             object
Confirmed         int64
Deaths            int64
Recovered         int64
Lat             float64
Long            float64
geoId            object
popData2020     float64
continentExp     object
dtype: object

In [19]:
# Cabia Date a tipo fecha
df['Date'] = pd.to_datetime(df.Date)
df['DateCopy'] = df['Date']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df.Date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DateCopy'] = df['Date']


In [20]:
# Utilizo la columna Date como indice
df.set_index('Date', inplace=True)

### 9. Extraccion de nuevas columnas a traves del indice `Date`

In [21]:
'''
def atributos_fecha (data):
    data['Year'] = data.index.year
    data['Month'] = data.index.month
    data['Week'] = data.index.week
    data['Week-Copy'] = data.index.week
    data['Day'] = data.index.day
    return(data)
    '''
#atributos_fecha(df)

"\ndef atributos_fecha (data):\n    data['Year'] = data.index.year\n    data['Month'] = data.index.month\n    data['Week'] = data.index.week\n    data['Week-Copy'] = data.index.week\n    data['Day'] = data.index.day\n    return(data)\n    "

In [22]:
#df['year'] = pd.DatetimeIndex(df['Joined date']).year
df['Year'] = pd.DatetimeIndex(df['DateCopy']).year
df['Month'] = pd.DatetimeIndex(df['DateCopy']).month
df['Week'] = pd.DatetimeIndex(df['DateCopy']).week
df['Week-Copy'] = pd.DatetimeIndex(df['DateCopy']).week
df['Day'] = pd.DatetimeIndex(df['DateCopy']).day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = pd.DatetimeIndex(df['DateCopy']).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Month'] = pd.DatetimeIndex(df['DateCopy']).month
  df['Week'] = pd.DatetimeIndex(df['DateCopy']).week
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Week'] = pd.DatetimeIndex(df['DateCopy']).we

In [23]:
df['Year'] = df['Year'].astype({"Year": str})
df['Week'] = df['Week'].astype({"Week": str})
df['Week-Copy'] = df['Week'].astype({"Week": str})
df['Day'] = df['Day'].astype({"Day": str})
df['Month'] = df['Month'].astype({"Month": str})
df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = df['Year'].astype({"Year": str})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Week'] = df['Week'].astype({"Week": str})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Week-Copy'] = df['Week'].astype({"Week": str})
A value is trying to be set on a copy of a slice from a DataFr

Country                 object
Confirmed                int64
Deaths                   int64
Recovered                int64
Lat                    float64
Long                   float64
geoId                   object
popData2020            float64
continentExp            object
DateCopy        datetime64[ns]
Year                    object
Month                   object
Week                    object
Week-Copy               object
Day                     object
dtype: object

In [24]:
df["Week-Copy"] = df["Week-Copy"].replace('1','01')
df["Week-Copy"] = df["Week-Copy"].replace('2','02')
df["Week-Copy"] = df["Week-Copy"].replace('3','03')
df["Week-Copy"] = df["Week-Copy"].replace('4','04')
df["Week-Copy"] = df["Week-Copy"].replace('6','06')
df["Week-Copy"] = df["Week-Copy"].replace('7','08')
df["Week-Copy"] = df["Week-Copy"].replace('8','08')
df["Week-Copy"] = df["Week-Copy"].replace('9','09')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('1','01')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('2','02')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('3','03')
A value is trying to be set on a copy of a slice 

In [25]:
df["Year-Week"] = df["Year"] + "-" + df["Week-Copy"]
df["Year-Week-Copy"] = df["Year"] + "-W" + df["Week-Copy"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year-Week"] = df["Year"] + "-" + df["Week-Copy"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year-Week-Copy"] = df["Year"] + "-W" + df["Week-Copy"]


### 10. Preaparo el dataframe (df) para el mergeo con `data_extra`. Ordeno las columnas y elimino las columnas sobrantes.  

In [26]:
print(df.columns)
df = df.drop(['continentExp'], axis=1)

# Reordeno las Columnas
df = df[['DateCopy','Country','geoId','Lat','Long','Year','Month','Week','Day','popData2020','Confirmed','Deaths',
         'Recovered','Week-Copy','Year-Week','Year-Week-Copy']]

df = df.rename(columns={'popData2020':'Population'})\
    
df['Population'] = df['Population'].astype(int)
df['DateCopy'] = df['DateCopy'].astype(str)

df['id-merge'] = df['geoId']+df['Year-Week-Copy']
df['id-merge-country-date'] = df['Country']+df['DateCopy']

df['DateCopy'] = pd.to_datetime(df.DateCopy)


Index(['Country', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long', 'geoId',
       'popData2020', 'continentExp', 'DateCopy', 'Year', 'Month', 'Week',
       'Week-Copy', 'Day', 'Year-Week', 'Year-Week-Copy'],
      dtype='object')


### 11. Limpieza DataSet ` ICU_hospital.csv`

In [27]:
#url_UCI = "https://opendata.ecdc.europa.eu/covid19/hospitalicuadmissionrates/csv/data.csv"
#df_ex3 = pd.read_csv(url_UCI)
df_ex3 = pd.read_csv('data_extra/ICU_hospital.csv')

In [28]:
df_ex3.head(10)

Unnamed: 0,country,indicator,date,year_week,value,source,url
0,Austria,Daily hospital occupancy,2020-04-01,2020-W14,856.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
1,Austria,Daily hospital occupancy,2020-04-02,2020-W14,823.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
2,Austria,Daily hospital occupancy,2020-04-03,2020-W14,829.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
3,Austria,Daily hospital occupancy,2020-04-04,2020-W14,826.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
4,Austria,Daily hospital occupancy,2020-04-05,2020-W14,712.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
5,Austria,Daily hospital occupancy,2020-04-06,2020-W15,824.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
6,Austria,Daily hospital occupancy,2020-04-07,2020-W15,857.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
7,Austria,Daily hospital occupancy,2020-04-08,2020-W15,829.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
8,Austria,Daily hospital occupancy,2020-04-09,2020-W15,820.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...
9,Austria,Daily hospital occupancy,2020-04-10,2020-W15,771.0,Country_Website,https://covid19-dashboard.ages.at/dashboard_Ho...


In [29]:
print((df_ex3.isnull().sum()/len(df_ex3))*100)

country      0.000000
indicator    0.000000
date         0.000000
year_week    0.000000
value        0.000000
source       0.000000
url          9.862196
dtype: float64


In [30]:
df_ex3['indicator'].unique()

array(['Daily hospital occupancy', 'Daily ICU occupancy',
       'Weekly new hospital admissions per 100k',
       'Weekly new ICU admissions per 100k'], dtype=object)

In [31]:
# Divido el dataset en 4 dataset por indicator y luego los uno en uno solo por columnas
df_ex3['id-merge-country-date'] = df_ex3['country']+df_ex3['date']
df_ex3 = df_ex3.drop(['year_week', 'source', 'url'], axis=1)

In [32]:
# Hospital_Occupancy

filter_uci = df_ex3['indicator'] == 'Daily hospital occupancy'

# DEFINO EL DATAFRAME
df_uci_1 = df_ex3[filter_uci]
df_uci_1 = df_uci_1.rename(columns={'value':'Hospital_Occupancy'})
df_uci_1 = df_uci_1.drop(['indicator','date','country'], axis=1)

df_uci_1

Unnamed: 0,Hospital_Occupancy,id-merge-country-date
0,856.0,Austria2020-04-01
1,823.0,Austria2020-04-02
2,829.0,Austria2020-04-03
3,826.0,Austria2020-04-04
4,712.0,Austria2020-04-05
...,...,...
30733,496.0,Sweden2021-12-15
30734,526.0,Sweden2021-12-16
30735,524.0,Sweden2021-12-17
30736,513.0,Sweden2021-12-18


In [33]:
# ICU occupancy

filter_uci = df_ex3['indicator'] == 'Daily ICU occupancy'

#DEFINO EL DATAFRAME
df_uci_2 = df_ex3[filter_uci]
df_uci_2 = df_uci_2.rename(columns={'value':'ICU_Occupancy'})
df_uci_2 = df_uci_2.drop(['indicator','date','country'], axis=1)

df_uci_2

Unnamed: 0,ICU_Occupancy,id-merge-country-date
628,215.0,Austria2020-04-01
629,219.0,Austria2020-04-02
630,245.0,Austria2020-04-03
631,245.0,Austria2020-04-04
632,244.0,Austria2020-04-05
...,...,...
31395,75.0,Sweden2021-12-15
31396,70.0,Sweden2021-12-16
31397,68.0,Sweden2021-12-17
31398,72.0,Sweden2021-12-18


In [34]:
df_ex3 = df_ex3.drop(['indicator','value','country'], axis=1)
df_ex3

Unnamed: 0,date,id-merge-country-date
0,2020-04-01,Austria2020-04-01
1,2020-04-02,Austria2020-04-02
2,2020-04-03,Austria2020-04-03
3,2020-04-04,Austria2020-04-04
4,2020-04-05,Austria2020-04-05
...,...,...
31489,2021-11-21,Sweden2021-11-21
31490,2021-11-28,Sweden2021-11-28
31491,2021-12-05,Sweden2021-12-05
31492,2021-12-12,Sweden2021-12-12


In [35]:
df_ex3 = pd.merge(df_ex3, df_uci_1 , how='left', on='id-merge-country-date')
df_ex3 = pd.merge(df_ex3, df_uci_2 , how='left', on='id-merge-country-date')
df_ex3 = df_ex3.drop(columns=['date'])
df_ex3

Unnamed: 0,id-merge-country-date,Hospital_Occupancy,ICU_Occupancy
0,Austria2020-04-01,856.0,215.0
1,Austria2020-04-02,823.0,219.0
2,Austria2020-04-03,829.0,245.0
3,Austria2020-04-04,826.0,245.0
4,Austria2020-04-05,712.0,244.0
...,...,...,...
31489,Sweden2021-11-21,272.0,31.0
31490,Sweden2021-11-28,293.0,29.0
31491,Sweden2021-12-05,344.0,46.0
31492,Sweden2021-12-12,451.0,53.0


### 11. Limpieza DataSet `test_rate.csv`

In [36]:
#url_test_rate = "https://opendata.ecdc.europa.eu/covid19/testing/csv/data.csv"
#df_ex1 = pd.read_csv(url_test_rate)
df_ex1 = pd.read_csv('data_extra/test_rate.csv')

In [37]:
df_ex1

Unnamed: 0,country,country_code,year_week,level,region,region_name,new_cases,tests_done,population,testing_rate,positivity_rate,testing_data_source
0,Austria,AT,2020-W15,national,AT,Austria,1838,12339,8901064.0,138.623877,14.895859,Manual webscraping
1,Austria,AT,2020-W16,national,AT,Austria,684,58488,8901064.0,657.089984,1.169471,Manual webscraping
2,Austria,AT,2020-W17,national,AT,Austria,448,33443,8901064.0,375.719128,1.339593,Manual webscraping
3,Austria,AT,2020-W18,national,AT,Austria,312,26598,8901064.0,298.818209,1.173021,Country website
4,Austria,AT,2020-W19,national,AT,Austria,264,42153,8901064.0,473.572598,0.626290,Country website
...,...,...,...,...,...,...,...,...,...,...,...,...
11750,Sweden,SE,2021-W46,national,SE,Sweden,7095,123920,10327589.0,1199.892831,5.725468,TESSy
11751,Sweden,SE,2021-W47,national,SE,Sweden,11916,226289,10327589.0,2191.111594,5.265833,TESSy
11752,Sweden,SE,2021-W48,national,SE,Sweden,13802,273987,10327589.0,2652.961887,5.037465,TESSy
11753,Sweden,SE,2021-W49,national,SE,Sweden,18659,335956,10327589.0,3252.995447,5.554001,TESSy


In [38]:
df_ex1['level'].unique()

array(['national', 'subnational'], dtype=object)

In [39]:
# Dentro del dataset exiten datos a nivel nacional y subnacional. Vamos a filtrar por nacional, para descartar 
# todas las lineas por provicia, ya que nuestro analisis es a nivel nacional en Europa.

filter_national = df_ex1['level'] == 'national'
df_ex1 = df_ex1[filter_national]

In [40]:
# Eliminamos columnas no necesarias 

df_ex1 = df_ex1.drop(['region_name', 'new_cases', 'testing_data_source','population','region_name','level','region'], axis=1)

In [41]:
df_ex1['id-merge'] = df_ex1['country_code'] + df_ex1['year_week']
df_ex1 = df_ex1.drop(['year_week', 'country_code', 'country'], axis=1)
df_ex1

Unnamed: 0,tests_done,testing_rate,positivity_rate,id-merge
0,12339,138.623877,14.895859,AT2020-W15
1,58488,657.089984,1.169471,AT2020-W16
2,33443,375.719128,1.339593,AT2020-W17
3,26598,298.818209,1.173021,AT2020-W18
4,42153,473.572598,0.626290,AT2020-W19
...,...,...,...,...
11750,123920,1199.892831,5.725468,SE2021-W46
11751,226289,2191.111594,5.265833,SE2021-W47
11752,273987,2652.961887,5.037465,SE2021-W48
11753,335956,3252.995447,5.554001,SE2021-W49


### 12. Limpieza DataSet `vaccine_tracker.csv`

In [42]:
#url_vaccine_tracker = "https://opendata.ecdc.europa.eu/covid19/vaccine_tracker/csv/data.csv"
#df_ex2 = pd.read_csv(url_vaccine_tracker)
df_ex2 = pd.read_csv('data_extra/vaccine_tracker.csv')


In [43]:
df_ex2['TargetGroup'].unique()

array(['ALL', 'Age0_4', 'Age10_14', 'Age15_17', 'Age18_24', 'Age25_49',
       'Age50_59', 'Age5_9', 'Age60_69', 'Age70_79', 'Age80+', 'Age<18',
       'AgeUNK', 'HCW', 'LTCF', '1_Age60+', '1_Age<60'], dtype=object)

In [44]:
filter_target = df_ex2['TargetGroup'] != '1_Age<60'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != '1_Age60+'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'LTCF'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'HCW'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'AgeUNK'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'Age<18'
df_ex2 = df_ex2[filter_target]

filter_target = df_ex2['TargetGroup'] != 'ALL'
df_ex2 = df_ex2[filter_target]


df_ex2['TargetGroup'].unique()

array(['Age0_4', 'Age10_14', 'Age15_17', 'Age18_24', 'Age25_49',
       'Age50_59', 'Age5_9', 'Age60_69', 'Age70_79', 'Age80+'],
      dtype=object)

In [45]:
df_ex2 = df_ex2.drop(['Denominator', 'DoseAdditional1', 'UnknownDose','Population','Vaccine'], axis=1)

df_ex2 = df_ex2.drop(['TargetGroup'], axis=1)

df_ex2['id-merge'] = df_ex2['ReportingCountry'] + df_ex2['YearWeekISO']


df_ex2 = df_ex2.drop(['YearWeekISO', 'ReportingCountry', 'Region'], axis=1)

df_ex2 = df_ex2.groupby(['id-merge']).sum().reset_index()

df_ex2

Unnamed: 0,id-merge,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose
0,AT2020-W53,614250.0,0.0,5249,0.0,0
1,AT2021-W01,614250.0,0.0,26205,0.0,0
2,AT2021-W02,686250.0,0.0,85006,0.0,399
3,AT2021-W03,585000.0,0.0,93304,0.0,4572
4,AT2021-W04,549900.0,0.0,31525,0.0,17538
...,...,...,...,...,...,...
1378,SK2021-W47,0.0,0.0,53204,0.0,8188
1379,SK2021-W48,0.0,0.0,28858,0.0,8425
1380,SK2021-W49,0.0,0.0,22157,0.0,15976
1381,SK2021-W50,0.0,0.0,18650,0.0,23346


### 13. Enriquecimiento de `df` con DataSet `test_rate.csv`, `vaccine_tracker.csv` y `ICU_hospital.csv`

In [56]:
df.columns

Index(['Date', 'Country', 'Lat', 'Long', 'Year', 'Month', 'Week', 'Day',
       'Population', 'Confirmed', 'Deaths', 'Recovered', 'Year-Week',
       'tests_done', 'testing_rate', 'positivity_rate', 'DosesReceived',
       'DosesExported', 'FirstDose', 'FirstDoseRefused', 'SecondDose',
       'Hospital_Occupancy', 'ICU_Occupancy'],
      dtype='object')

In [47]:
df = pd.merge(df, df_ex1 , how='left', on='id-merge')
df = pd.merge(df, df_ex2 , how='left', on='id-merge')
df = pd.merge(df, df_ex3 , how='left', on='id-merge-country-date')

In [58]:
df.columns

Index(['Date', 'Country', 'Lat', 'Long', 'Year', 'Month', 'Week', 'Day',
       'Population', 'Confirmed', 'Deaths', 'Recovered', 'Year-Week',
       'tests_done', 'testing_rate', 'positivity_rate', 'DosesReceived',
       'DosesExported', 'FirstDose', 'FirstDoseRefused', 'SecondDose',
       'Hospital_Occupancy', 'ICU_Occupancy'],
      dtype='object')

### 14. Preparacion dataframe final

In [50]:
df = df.drop(['id-merge', 'Year-Week-Copy', 'Week-Copy', 'geoId','id-merge-country-date'], axis=1)

df = df.rename(columns={'NumberDosesReceived':'DosesReceived',
                        'NumberDosesExported':'DosesExported',
                        'DateCopy':'Date'})

df.columns

Index(['Date', 'Country', 'Lat', 'Long', 'Year', 'Month', 'Week', 'Day',
       'Population', 'Confirmed', 'Deaths', 'Recovered', 'Year-Week',
       'tests_done', 'testing_rate', 'positivity_rate', 'DosesReceived',
       'DosesExported', 'FirstDose', 'FirstDoseRefused', 'SecondDose',
       'Hospital_Occupancy', 'ICU_Occupancy'],
      dtype='object')

### 15. Anadiendo columnas calculadas al df final

In [51]:
""" 
df['n_test'] = df['testing_rate'] * 100_000
df['%_FirstDose'] = (df['FirstDose'] / df['Population']) *100
df['%_SecondDose'] = (df['SecondDose'] / df['Population']) *100
df['%_FirstDoseRefused'] = (df['FirstDoseRefused'] / df['Population']) *100
"""

" \ndf['n_test'] = df['testing_rate'] * 100_000\ndf['%_FirstDose'] = (df['FirstDose'] / df['Population']) *100\ndf['%_SecondDose'] = (df['SecondDose'] / df['Population']) *100\ndf['%_FirstDoseRefused'] = (df['FirstDoseRefused'] / df['Population']) *100\n"

In [52]:
df = df.sort_values(['Date'], ascending=[True])
#filter_Confirmed_0 = df['Confirmed'] != 0
#df = df[filter_Confirmed_0]
df

Unnamed: 0,Date,Country,Lat,Long,Year,Month,Week,Day,Population,Confirmed,...,tests_done,testing_rate,positivity_rate,DosesReceived,DosesExported,FirstDose,FirstDoseRefused,SecondDose,Hospital_Occupancy,ICU_Occupancy
0,2020-01-22,Austria,48.200000,16.366667,2020,1,4,22,8901064,0,...,,,,,,,,,,
27,2020-01-22,Sweden,59.333333,18.050000,2020,1,4,22,10327589,0,...,11.0,0.106511,0.0,,,,,,,
26,2020-01-22,Spain,40.400000,-3.683333,2020,1,4,22,47332614,0,...,,,,,,,,,,
25,2020-01-22,Slovenia,46.050000,14.516667,2020,1,4,22,2095861,0,...,,,,,,,,,,
24,2020-01-22,Slovakia,48.150000,17.116667,2020,1,4,22,5457873,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33140,2021-12-28,Belgium,50.833333,4.333333,2021,12,52,28,11522440,2062836,...,,,,,,,,,,
33139,2021-12-28,Austria,48.200000,16.366667,2021,12,52,28,8901064,1268519,...,,,,,,,,,,
33165,2021-12-28,Spain,40.400000,-3.683333,2021,12,52,28,47332614,6032297,...,,,,,,,,,,
33151,2021-12-28,Ireland,53.316667,-6.233333,2021,12,52,28,4964440,731467,...,,,,,,,,,,


In [53]:
"""
# Rellenando nulos con el anterior registro
print((df.isnull().sum()/len(df))*100)
#df.isnull().sum().sum()
df = df.fillna(method='pad')
df
"""

"\n# Rellenando nulos con el anterior registro\nprint((df.isnull().sum()/len(df))*100)\n#df.isnull().sum().sum()\ndf = df.fillna(method='pad')\ndf\n"

In [54]:

#print((df.isnull().sum()/len(df))*100)
#df.dtypes

### 16. Exporto df como `.csv`

In [55]:
#df.to_csv('df.csv')