# Data Cleaning

### Impotando Librerias

In [1]:
import pandas as pd
import numpy as np
import datetime

### URL Data [Johns Hopkins University (JHU)](https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases)

In [2]:
url_confirmed_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv"
url_deaths_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv"
url_recovered_global = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_recovered_global.csv&filename=time_series_covid19_recovered_global.csv"

### 1. Limpieza DataSet `confirmed_global.csv`

In [3]:
df1 = pd.read_csv(url_confirmed_global)

#### 1.1. El analisis se va a hacer por pais, no por provincia de modo que elimino la columna `Province/State`. Las columnas de `Lat` y `Long` se van a eliminar ahora para luego mergearlas con el dataframe final, ya que las coordenadas se cerian alteradas en el `groupby`.

In [4]:
df1 = df1.drop(['Province/State'], axis=1)
df1 = df1.drop(['Lat'], axis=1)
df1 = df1.drop(['Long'], axis=1)

#### 1.2. Una vez eliminada dichas columnas agrupamos los datos a nivel de fila por pais Sumando asi todos los casos por pais que anteriormente estaban subdivididos por `'Province/State'`.

In [5]:
# Comprobamos que efectivamente, hay nombres de paises que aparecen varias veces
print(df1["Country/Region"].value_counts().to_string())

China                               34
Canada                              16
United Kingdom                      12
France                              12
Australia                            8
Netherlands                          5
Denmark                              3
New Zealand                          2
Panama                               1
Niger                                1
Nigeria                              1
North Macedonia                      1
Norway                               1
Oman                                 1
Pakistan                             1
Palau                                1
Peru                                 1
Papua New Guinea                     1
Paraguay                             1
Philippines                          1
Poland                               1
Portugal                             1
Qatar                                1
Romania                              1
Russia                               1
Rwanda                   

In [6]:
df1.loc[df1["Country/Region"] == "Austria"]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,12/18/21,12/19/21,12/20/21,12/21/21,12/22/21,12/23/21,12/24/21,12/25/21,12/26/21,12/27/21
16,Austria,0,0,0,0,0,0,0,0,0,...,1247399,1249641,1251433,1253961,1256230,1258377,1260751,1262836,1264553,1266103


In [7]:
# Vemos que tras el groupby los casos de agrupado correctamente, ya que la suma de la columna de casos de un dia especifico
# es igual a la fila de ese mismo dia para df1 tras esta operacion
df1 = df1.groupby(['Country/Region']).sum().reset_index()
print(df1.loc[df1["Country/Region"] == "Austria"].sum())

Country/Region    Austria
1/22/20                 0
1/23/20                 0
1/24/20                 0
1/25/20                 0
                   ...   
12/23/21          1258377
12/24/21          1260751
12/25/21          1262836
12/26/21          1264553
12/27/21          1266103
Length: 707, dtype: object


In [8]:
# Vemos que solo existe un valor por pais. 
print(df1["Country/Region"].value_counts().to_string())

Afghanistan                         1
Namibia                             1
Netherlands                         1
New Zealand                         1
Nicaragua                           1
Niger                               1
Nigeria                             1
North Macedonia                     1
Norway                              1
Oman                                1
Pakistan                            1
Palau                               1
Panama                              1
Papua New Guinea                    1
Paraguay                            1
Peru                                1
Philippines                         1
Poland                              1
Portugal                            1
Qatar                               1
Romania                             1
Russia                              1
Rwanda                              1
Nepal                               1
Mozambique                          1
Albania                             1
Morocco     

**Confirmamos que el groupby se ha completado con exito.** 

#### 1.3. Mergeamos las columnas de fecha por cada pais y anadimos una columna con su valor correspondiente

In [9]:
# Agrupo las columnas de fecha en filas utilizando la funcion `melt` y hago un idetificador unico para mergear con el resto
# de tablas, que sera el (dia)+(el nombre del pais) para poder mergear correctamente con el resto de tablas por dia y pais
df1 = df1.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Confirmed")

# Creo la columna con el identificador para usarla como indentificador unico para el mergeo
df1['Date-Country'] = df1['Date'] + df1['Country/Region']

# Hago esta misma columna indice del dataframe
df1.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/27/21Vietnam,Vietnam,12/27/21,1666545
12/27/21West Bank and Gaza,West Bank and Gaza,12/27/21,468619
12/27/21Yemen,Yemen,12/27/21,10118
12/27/21Zambia,Zambia,12/27/21,234476


### 2. Limpieza DataSet `deaths_global.csv`

- 2.1. Repetimos el mismo proceso anterior para el dataset `deaths_global.csv`

In [10]:
df2 = pd.read_csv(url_deaths_global)
df2 = df2.drop(['Province/State'], axis=1)
df2 = df2.drop(['Lat'], axis=1)
df2 = df2.drop(['Long'], axis=1)
df2 = df2.groupby(['Country/Region']).sum().reset_index()
df2 = df2.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Deaths")
df2['Date-Country'] = df2['Date'] + df2['Country/Region']

df2.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/27/21Vietnam,Vietnam,12/27/21,31418
12/27/21West Bank and Gaza,West Bank and Gaza,12/27/21,4907
12/27/21Yemen,Yemen,12/27/21,1984
12/27/21Zambia,Zambia,12/27/21,3709


### 3. Limpieza DataSet `recovered_global.csv`

- 3.1. Repetimos el mismo proceso anterior para el dataset `recovered_global.csv`

In [11]:
df3 = pd.read_csv(url_recovered_global)
df3 = df3.drop(['Province/State'], axis=1)
df3 = df3.drop(['Lat'], axis=1)
df3 = df3.drop(['Long'], axis=1)
df3 = df3.groupby(['Country/Region']).sum().reset_index()
df3 = df3.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Recovered")
df3['Date-Country'] = df3['Date'] + df3['Country/Region']
df3.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Recovered
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
12/27/21Vietnam,Vietnam,12/27/21,0
12/27/21West Bank and Gaza,West Bank and Gaza,12/27/21,0
12/27/21Yemen,Yemen,12/27/21,0
12/27/21Zambia,Zambia,12/27/21,0


### 4. Juntamos todos los dataset (df1, df2 y df3) en uno solo (df) por el indice Date

In [12]:
# Creo un primer dataframe final (df_f1), mergeando df1 y df2 por 'Date-Country'
df = pd.merge(df1, df2 , how='left', on='Date-Country')

# Creo un sefundo dataframe final, mergeando el anterior dataframe (df_f1) y df3 por 'Date-Country'
df = pd.merge(df, df3 , how='left', on='Date-Country')


In [13]:
#Elimino las columnas duplicadas
df = df.drop(['Date-Country','Country/Region_y','Date_y', 'Country/Region_x','Date_x'], axis=1)

# Reordeno las Columnas
df = df[['Country/Region','Date','Confirmed','Deaths','Recovered']]
df = df.rename(columns={'Country/Region':'Country'})

### 5. Anado las columnas de "Lat" y "Long" al dataframe df

In [14]:
df4 = pd.read_csv("data_extra/concap.csv")
df4.drop(df4.columns.difference(['CountryName','CapitalLatitude','CapitalLongitude','CountryCode']), 1, inplace=True)
df4 = df4.drop_duplicates()

df4 = df4.rename(columns={'CountryName':'Country',
                          'CapitalLatitude':'Lat', 
                          'CapitalLongitude':'Long', 
                          'CountryCode':'geoId'})
df4


  df4.drop(df4.columns.difference(['CountryName','CapitalLatitude','CapitalLongitude','CountryCode']), 1, inplace=True)


Unnamed: 0,Country,Lat,Long,geoId
0,Somaliland,9.550000,44.050000,
1,South Georgia and South Sandwich Islands,-54.283333,-36.500000,GS
2,French Southern and Antarctic Lands,-49.350000,70.216667,TF
3,Palestine,31.766667,35.233333,PS
4,Aland Islands,60.116667,19.900000,AX
...,...,...,...,...
240,Northern Cyprus,35.183333,33.366667,
241,Hong Kong,0.000000,0.000000,HK
242,Heard Island and McDonald Islands,0.000000,0.000000,HM
243,British Indian Ocean Territory,-7.300000,72.400000,IO


In [15]:
df = pd.merge(df, df4 , how='left', on='Country')

### 6. Anado las columnas de informacion geografica de los paises al dataframe (df)

In [16]:
df5 = pd.read_csv("data_extra/Cases-Death-Country-TimeSeries.csv")
df5.drop(df5.columns.difference(['geoId','popData2020', 'continentExp']), 1, inplace=True)
df5 = df5.drop_duplicates()

  df5.drop(df5.columns.difference(['geoId','popData2020', 'continentExp']), 1, inplace=True)


In [17]:
df = pd.merge(df, df5 , how='left', on='geoId')
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,popData2020,continentExp
0,Afghanistan,1/22/20,0,0,0,34.516667,69.183333,AF,,
1,Albania,1/22/20,0,0,0,41.316667,19.816667,AL,,
2,Algeria,1/22/20,0,0,0,36.750000,3.050000,DZ,,
3,Andorra,1/22/20,0,0,0,42.500000,1.516667,AD,,
4,Angola,1/22/20,0,0,0,-8.833333,13.216667,AO,,
...,...,...,...,...,...,...,...,...,...,...
138371,Vietnam,12/27/21,1666545,31418,0,21.033333,105.850000,VN,,
138372,West Bank and Gaza,12/27/21,468619,4907,0,,,,,
138373,Yemen,12/27/21,10118,1984,0,15.350000,44.200000,YE,,
138374,Zambia,12/27/21,234476,3709,0,-15.416667,28.283333,ZM,,


### 7. Filtro todo el dataframe por continentExP: "Europe"

Para poder aprovechar los dataset de data_extra, que estan centrados unicamente en Europa, y ademas poder centrar mejor el analisis, voy a filtrar el dataframe eliminando todos los paises que no son europeos.

- 7.1. Filtro el dataframe (df) para paises europeos 

In [18]:
filter_europe = df['continentExp'] == 'Europe'
df = df[filter_europe]

### 8. Cambio de tipo de datos e indice

In [19]:
#df.dtypes

In [20]:
# Cabia Date a tipo fecha
df['Date'] = pd.to_datetime(df.Date)
df['Date-Copy'] = df['Date']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df.Date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date-Copy'] = df['Date']


In [21]:
# Utilizo la columna Date como indice
df.set_index('Date', inplace=True)

### 9. Extraccion de nuevas columnas a traves del indice `Date`

In [22]:
'''
def atributos_fecha (data):
    data['Year'] = data.index.year
    data['Month'] = data.index.month
    data['Week'] = data.index.week
    data['Week-Copy'] = data.index.week
    data['Day'] = data.index.day
    return(data)
    '''
#atributos_fecha(df)

"\ndef atributos_fecha (data):\n    data['Year'] = data.index.year\n    data['Month'] = data.index.month\n    data['Week'] = data.index.week\n    data['Week-Copy'] = data.index.week\n    data['Day'] = data.index.day\n    return(data)\n    "

In [23]:
#df['year'] = pd.DatetimeIndex(df['Joined date']).year
df['Year'] = pd.DatetimeIndex(df['Date-Copy']).year
df['Month'] = pd.DatetimeIndex(df['Date-Copy']).month
df['Week'] = pd.DatetimeIndex(df['Date-Copy']).week
df['Week-Copy'] = pd.DatetimeIndex(df['Date-Copy']).week
df['Day'] = pd.DatetimeIndex(df['Date-Copy']).day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = pd.DatetimeIndex(df['Date-Copy']).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Month'] = pd.DatetimeIndex(df['Date-Copy']).month
  df['Week'] = pd.DatetimeIndex(df['Date-Copy']).week
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Week'] = pd.DatetimeIndex(df['Date-Copy']

In [24]:
df['Year'] = df['Year'].astype({"Year": str})
df['Week'] = df['Week'].astype({"Week": str})
df['Week-Copy'] = df['Week'].astype({"Week": str})
df['Day'] = df['Day'].astype({"Day": str})
df['Month'] = df['Month'].astype({"Month": str})
df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = df['Year'].astype({"Year": str})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Week'] = df['Week'].astype({"Week": str})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Week-Copy'] = df['Week'].astype({"Week": str})
A value is trying to be set on a copy of a slice from a DataFr

Country                 object
Confirmed                int64
Deaths                   int64
Recovered                int64
Lat                    float64
Long                   float64
geoId                   object
popData2020            float64
continentExp            object
Date-Copy       datetime64[ns]
Year                    object
Month                   object
Week                    object
Week-Copy               object
Day                     object
dtype: object

In [25]:
df["Week-Copy"] = df["Week-Copy"].replace('1','01')
df["Week-Copy"] = df["Week-Copy"].replace('2','02')
df["Week-Copy"] = df["Week-Copy"].replace('3','03')
df["Week-Copy"] = df["Week-Copy"].replace('4','04')
df["Week-Copy"] = df["Week-Copy"].replace('6','06')
df["Week-Copy"] = df["Week-Copy"].replace('7','08')
df["Week-Copy"] = df["Week-Copy"].replace('8','08')
df["Week-Copy"] = df["Week-Copy"].replace('9','09')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('1','01')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('2','02')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('3','03')
A value is trying to be set on a copy of a slice 

In [26]:
df["Year-Week"] = df["Year"] + "-" + df["Week-Copy"]
df["Year-Week-Copy"] = df["Year"] + "-W" + df["Week-Copy"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year-Week"] = df["Year"] + "-" + df["Week-Copy"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year-Week-Copy"] = df["Year"] + "-W" + df["Week-Copy"]


### 10. Preaparo el dataframe (df) para el mergeo con `data_extra`. Ordeno las columnas y elimino las columnas sobrantes.  

In [27]:
print(df.columns)
df = df.drop(['continentExp'], axis=1)

# Reordeno las Columnas
df = df[['Date-Copy','Country','geoId','Lat','Long','Year','Month','Week','Day','popData2020','Confirmed','Deaths',
         'Recovered','Week-Copy','Year-Week','Year-Week-Copy']]

df = df.rename(columns={'popData2020':'Population'})
df['Population'] = df['Population'].astype(int)
df['id-merge'] = df['geoId']+df['Year-Week-Copy']


Index(['Country', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long', 'geoId',
       'popData2020', 'continentExp', 'Date-Copy', 'Year', 'Month', 'Week',
       'Week-Copy', 'Day', 'Year-Week', 'Year-Week-Copy'],
      dtype='object')


### 11. Limpieza DataSet `test_rate.csv`

In [28]:
df_ex1 = pd.read_csv("data_extra/test_rate.csv")

- 11.1. Dentro del dataset exiten datos a nivel nacional y por comunidades. Vamos a filtrar por nacional, para descartar todas las lineas por provicia, ya que nuestro analisis es a nivel nacional en Europa. 

In [29]:
filter_national = df_ex1['level'] == 'national'

df_ex1 = df_ex1[filter_national]

- 11.2. Eliminamos columnas no necesarias

In [30]:
df_ex1 = df_ex1.drop(['region_name', 'new_cases', 'testing_data_source','population','region_name','level','region'], axis=1)


In [31]:
df_ex1['id-merge'] = df_ex1['country_code'] + df_ex1['year_week']
df_ex1 = df_ex1.drop(['year_week', 'country_code', 'country'], axis=1)


### 12. Limpieza DataSet `vaccine_tracker.csv`

In [32]:
df_ex2 = pd.read_csv("data_extra/vaccine_tracker.csv")

In [33]:
df_ex2 = df_ex2.drop(['Denominator', 'DoseAdditional1', 'UnknownDose','Population','Vaccine'], axis=1)

filter_vacc = df_ex2['TargetGroup'] == 'ALL'
df_ex2 = df_ex2[filter_vacc]

df_ex2 = df_ex2.drop(['TargetGroup'], axis=1)

df_ex2['id-merge'] = df_ex2['ReportingCountry'] + df_ex2['YearWeekISO']

df_ex2 = df_ex2.drop(['YearWeekISO', 'ReportingCountry', 'Region'], axis=1)

df_ex2 = df_ex2.groupby(['id-merge']).sum().reset_index()

df_ex2

Unnamed: 0,id-merge,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose
0,AT2020-W53,61425.0,0.0,5243,0.0,0
1,AT2021-W01,61425.0,0.0,26181,0.0,0
2,AT2021-W02,68625.0,0.0,84934,0.0,398
3,AT2021-W03,58500.0,0.0,93267,0.0,4568
4,AT2021-W04,54990.0,0.0,31517,0.0,17516
...,...,...,...,...,...,...
1548,SK2021-W47,0.0,0.0,49023,0.0,6738
1549,SK2021-W48,0.0,0.0,26133,0.0,7134
1550,SK2021-W49,0.0,0.0,20317,0.0,14250
1551,SK2021-W50,0.0,0.0,16742,0.0,21223


### 13. Enriquecimiento de `df` con DataSet `test_rate.csv` y `vaccine_tracker.csv`

In [34]:
df = pd.merge(df, df_ex1 , how='left', on='id-merge')
df = pd.merge(df, df_ex2 , how='left', on='id-merge')
df

Unnamed: 0,Date-Copy,Country,geoId,Lat,Long,Year,Month,Week,Day,Population,...,Year-Week-Copy,id-merge,tests_done,testing_rate,positivity_rate,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose
0,2020-01-22,Austria,AT,48.200000,16.366667,2020,1,4,22,8901064,...,2020-W04,AT2020-W04,,,,,,,,
1,2020-01-22,Belgium,BE,50.833333,4.333333,2020,1,4,22,11522440,...,2020-W04,BE2020-W04,,,,,,,,
2,2020-01-22,Bulgaria,BG,42.683333,23.316667,2020,1,4,22,6951482,...,2020-W04,BG2020-W04,,,,,,,,
3,2020-01-22,Croatia,HR,45.800000,16.000000,2020,1,4,22,4058165,...,2020-W04,HR2020-W04,,,,,,,,
4,2020-01-22,Cyprus,CY,35.166667,33.366667,2020,1,4,22,888005,...,2020-W04,CY2020-W04,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19763,2021-12-27,Romania,RO,44.433333,26.100000,2021,12,52,27,19328838,...,2021-W52,RO2021-W52,,,,,,,,
19764,2021-12-27,Slovakia,SK,48.150000,17.116667,2021,12,52,27,5457873,...,2021-W52,SK2021-W52,,,,,,,,
19765,2021-12-27,Slovenia,SI,46.050000,14.516667,2021,12,52,27,2095861,...,2021-W52,SI2021-W52,,,,,,,,
19766,2021-12-27,Spain,ES,40.400000,-3.683333,2021,12,52,27,47332614,...,2021-W52,ES2021-W52,,,,,,,,


### 14. Preparacion dataframe final

In [35]:
df.columns

Index(['Date-Copy', 'Country', 'geoId', 'Lat', 'Long', 'Year', 'Month', 'Week',
       'Day', 'Population', 'Confirmed', 'Deaths', 'Recovered', 'Week-Copy',
       'Year-Week', 'Year-Week-Copy', 'id-merge', 'tests_done', 'testing_rate',
       'positivity_rate', 'NumberDosesReceived', 'NumberDosesExported',
       'FirstDose', 'FirstDoseRefused', 'SecondDose'],
      dtype='object')

In [36]:
df = df.drop(['id-merge', 'Year-Week-Copy', 'Week-Copy', 'geoId'], axis=1)
df = df.rename(columns={'NumberDosesReceived':'DosesReceived',
                        'NumberDosesExported':'DosesExported',
                        'Date-Copy':'Date'})


df.columns

Index(['Date', 'Country', 'Lat', 'Long', 'Year', 'Month', 'Week', 'Day',
       'Population', 'Confirmed', 'Deaths', 'Recovered', 'Year-Week',
       'tests_done', 'testing_rate', 'positivity_rate', 'DosesReceived',
       'DosesExported', 'FirstDose', 'FirstDoseRefused', 'SecondDose'],
      dtype='object')

### 15. Anadiendo columnas calculadas al df final

In [37]:

df['n_test'] = df['testing_rate'] * 100_000
df['%_FirstDose'] = (df['FirstDose'] / df['Population']) *100
df['%_SecondDose'] = (df['SecondDose'] / df['Population']) *100
df['%_FirstDoseRefused'] = (df['FirstDoseRefused'] / df['Population']) *100

In [39]:
df

Unnamed: 0,Date,Country,Lat,Long,Year,Month,Week,Day,Population,Confirmed,...,positivity_rate,DosesReceived,DosesExported,FirstDose,FirstDoseRefused,SecondDose,n_test,%_FirstDose,%_SecondDose,%_FirstDoseRefused
0,2020-01-22,Austria,48.200000,16.366667,2020,1,4,22,8901064,0,...,,,,,,,,,,
1,2020-01-22,Belgium,50.833333,4.333333,2020,1,4,22,11522440,0,...,,,,,,,,,,
2,2020-01-22,Bulgaria,42.683333,23.316667,2020,1,4,22,6951482,0,...,,,,,,,,,,
3,2020-01-22,Croatia,45.800000,16.000000,2020,1,4,22,4058165,0,...,,,,,,,,,,
4,2020-01-22,Cyprus,35.166667,33.366667,2020,1,4,22,888005,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19763,2021-12-27,Romania,44.433333,26.100000,2021,12,52,27,19328838,1803311,...,,,,,,,,,,
19764,2021-12-27,Slovakia,48.150000,17.116667,2021,12,52,27,5457873,1351450,...,,,,,,,,,,
19765,2021-12-27,Slovenia,46.050000,14.516667,2021,12,52,27,2095861,456755,...,,,,,,,,,,
19766,2021-12-27,Spain,40.400000,-3.683333,2021,12,52,27,47332614,5932626,...,,,,,,,,,,


In [38]:
df.to_csv('df.csv')