# Data Cleaning

### Impotando Librerias

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re

### 1. Limpieza DataSet `confirmed_global.csv`

In [2]:
df1 = pd.read_csv("data_core/confirmed_global.csv")

#### 1.1. El analisis se va a hacer por pais, no por provincia de modo que elimino la columna `Province/State`. Las columnas de `Lat` y `Long` se van a eliminar ahora para luego mergearlas con el dataframe final, ya que las coordenadas se cerian alteradas en el `groupby`.

In [3]:
df1 = df1.drop(['Province/State'], axis=1)
df1 = df1.drop(['Lat'], axis=1)
df1 = df1.drop(['Long'], axis=1)

#### 1.2. Una vez eliminada dichas columnas agrupamos los datos a nivel de fila por pais Sumando asi todos los casos por pais que anteriormente estaban subdivididos por `'Province/State'`.

In [4]:
# Comprobamos que efectivamente, hay nombres de paises que aparecen varias veces
print(df1["Country/Region"].value_counts().to_string())

China                               33
Canada                              16
France                              12
United Kingdom                      12
Australia                            8
Netherlands                          5
Denmark                              3
Panama                               1
Nicaragua                            1
Niger                                1
Nigeria                              1
North Macedonia                      1
Norway                               1
Oman                                 1
Pakistan                             1
Afghanistan                          1
Papua New Guinea                     1
Paraguay                             1
Peru                                 1
Philippines                          1
Poland                               1
Portugal                             1
Qatar                                1
Romania                              1
Russia                               1
New Zealand              

In [5]:
df1.loc[df1["Country/Region"] == "Australia"]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
8,Australia,0,0,0,0,0,0,0,0,0,...,123,123,123,123,123,123,123,123,123,123
9,Australia,0,0,0,0,3,4,4,4,4,...,5296,5299,5300,5303,5310,5316,5318,5320,5324,5330
10,Australia,0,0,0,0,0,0,0,0,0,...,109,111,112,112,112,112,112,112,112,112
11,Australia,0,0,0,0,0,0,0,1,3,...,1485,1488,1489,1492,1491,1497,1500,1501,1502,1502
12,Australia,0,0,0,0,0,0,0,0,0,...,658,658,659,661,661,662,663,665,665,666
13,Australia,0,0,0,0,0,0,0,0,0,...,234,234,234,234,234,234,234,234,234,234
14,Australia,0,0,0,0,1,1,1,1,2,...,20484,20484,20484,20484,20484,20484,20484,20484,20485,20485
15,Australia,0,0,0,0,0,0,0,0,0,...,944,944,947,948,950,951,951,951,951,953


In [6]:
# Vemos que tras el groupby los casos de agrupado correctamente, ya que la suma de la columna de casos de un dia especifico
# es igual a la fila de ese mismo dia para df1 tras esta operacion
df1 = df1.groupby(['Country/Region']).sum().reset_index()
print(df1.loc[df1["Country/Region"] == "Australia"].sum())

Country/Region    Australia
1/22/20                   0
1/23/20                   0
1/24/20                   0
1/25/20                   0
                    ...    
4/6/21                29379
4/7/21                29385
4/8/21                29390
4/9/21                29396
4/10/21               29405
Length: 446, dtype: object


In [7]:
# Vemos que solo existe un valor por pais. 
print(list(df1["Country/Region"].value_counts()),print(df1["Country/Region"].value_counts().to_string()))

Afghanistan                         1
Albania                             1
Namibia                             1
Nepal                               1
Netherlands                         1
New Zealand                         1
Nicaragua                           1
Niger                               1
Nigeria                             1
North Macedonia                     1
Norway                              1
Oman                                1
Pakistan                            1
Panama                              1
Papua New Guinea                    1
Paraguay                            1
Peru                                1
Philippines                         1
Poland                              1
Portugal                            1
Qatar                               1
Romania                             1
Russia                              1
Mozambique                          1
Morocco                             1
Montenegro                          1
Malaysia    

**Confirmamos que el groupby se ha completado con exito.** 

#### 1.3. Mergeamos las columnas de fecha por cada pais y anadimos una columna con su valor correspondiente

In [8]:
# Agrupo las columnas de fecha en filas utilizando la funcion `melt` y hago un idetificador unico para mergear con el resto
# de tablas, que sera el (dia)+(el nombre del pais) para poder mergear correctamente con el resto de tablas por dia y pais
df1 = df1.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Confirmed")

# Creo la columna con el identificador para usarla como indentificador unico para el mergeo
df1['Date-Country'] = df1['Date'] + df1['Country/Region']

# Hago esta misma columna indice del dataframe
df1.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Confirmed
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,2692
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,265897
4/10/21Yemen,Yemen,4/10/21,5276
4/10/21Zambia,Zambia,4/10/21,89918


### 2. Limpieza DataSet `deaths_global.csv`

- 2.1. Repetimos el mismo proceso anterior para el dataset `deaths_global.csv`

In [9]:
df2 = pd.read_csv("data_core/deaths_global.csv")
df2 = df2.drop(['Province/State'], axis=1)
df2 = df2.drop(['Lat'], axis=1)
df2 = df2.drop(['Long'], axis=1)
df2 = df2.groupby(['Country/Region']).sum().reset_index()
df2 = df2.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Deaths")
df2['Date-Country'] = df2['Date'] + df2['Country/Region']

df2.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Deaths
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,35
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,2838
4/10/21Yemen,Yemen,4/10/21,1031
4/10/21Zambia,Zambia,4/10/21,1226


### 3. Limpieza DataSet `recovered_global.csv`

- 3.1. Repetimos el mismo proceso anterior para el dataset `recovered_global.csv`

In [10]:
df3 = pd.read_csv("data_core/recovered_global.csv")
df3 = df3.drop(['Province/State'], axis=1)
df3 = df3.drop(['Lat'], axis=1)
df3 = df3.drop(['Long'], axis=1)
df3 = df3.groupby(['Country/Region']).sum().reset_index()
df3 = df3.melt(id_vars=["Country/Region"], 
        var_name="Date", 
        value_name="Recovered")
df3['Date-Country'] = df3['Date'] + df3['Country/Region']
df3.set_index('Date-Country')

Unnamed: 0_level_0,Country/Region,Date,Recovered
Date-Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/22/20Afghanistan,Afghanistan,1/22/20,0
1/22/20Albania,Albania,1/22/20,0
1/22/20Algeria,Algeria,1/22/20,0
1/22/20Andorra,Andorra,1/22/20,0
1/22/20Angola,Angola,1/22/20,0
...,...,...,...
4/10/21Vietnam,Vietnam,4/10/21,2429
4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,231288
4/10/21Yemen,Yemen,4/10/21,2027
4/10/21Zambia,Zambia,4/10/21,86813


### 4. Juntamos todos los dataset (df1, df2 y df3) en uno solo (df) por el indice Date

In [11]:
# Creo un primer dataframe final (df_f1), mergeando df1 y df2 por 'Date-Country'
df = pd.merge(df1, df2 , how='left', on='Date-Country')

# Creo un sefundo dataframe final, mergeando el anterior dataframe (df_f1) y df3 por 'Date-Country'
df = pd.merge(df, df3 , how='left', on='Date-Country')

df


Unnamed: 0,Country/Region_x,Date_x,Confirmed,Date-Country,Country/Region_y,Date_y,Deaths,Country/Region,Date,Recovered
0,Afghanistan,1/22/20,0,1/22/20Afghanistan,Afghanistan,1/22/20,0,Afghanistan,1/22/20,0
1,Albania,1/22/20,0,1/22/20Albania,Albania,1/22/20,0,Albania,1/22/20,0
2,Algeria,1/22/20,0,1/22/20Algeria,Algeria,1/22/20,0,Algeria,1/22/20,0
3,Andorra,1/22/20,0,1/22/20Andorra,Andorra,1/22/20,0,Andorra,1/22/20,0
4,Angola,1/22/20,0,1/22/20Angola,Angola,1/22/20,0,Angola,1/22/20,0
...,...,...,...,...,...,...,...,...,...,...
85435,Vietnam,4/10/21,2692,4/10/21Vietnam,Vietnam,4/10/21,35,Vietnam,4/10/21,2429
85436,West Bank and Gaza,4/10/21,265897,4/10/21West Bank and Gaza,West Bank and Gaza,4/10/21,2838,West Bank and Gaza,4/10/21,231288
85437,Yemen,4/10/21,5276,4/10/21Yemen,Yemen,4/10/21,1031,Yemen,4/10/21,2027
85438,Zambia,4/10/21,89918,4/10/21Zambia,Zambia,4/10/21,1226,Zambia,4/10/21,86813


In [12]:
#Elimino las columnas duplicadas
df = df.drop(['Date-Country','Country/Region_y','Date_y', 'Country/Region_x','Date_x'], axis=1)

# Reordeno las Columnas
df = df[['Country/Region','Date','Confirmed','Deaths','Recovered']]
df = df.rename(columns={'Country/Region':'Country'})
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
0,Afghanistan,1/22/20,0,0,0
1,Albania,1/22/20,0,0,0
2,Algeria,1/22/20,0,0,0
3,Andorra,1/22/20,0,0,0
4,Angola,1/22/20,0,0,0
...,...,...,...,...,...
85435,Vietnam,4/10/21,2692,35,2429
85436,West Bank and Gaza,4/10/21,265897,2838,231288
85437,Yemen,4/10/21,5276,1031,2027
85438,Zambia,4/10/21,89918,1226,86813


In [13]:
#print(list(df["Country"].value_counts()),print(df["Country"].value_counts().to_string()),len(list(df["Country"].value_counts())))

### 5. Anado las columnas de "Lat" y "Long" al dataframe df

In [14]:
df4 = pd.read_csv("data_extra/concap.csv")
df4.drop(df4.columns.difference(['CountryName','CapitalLatitude','CapitalLongitude','CountryCode']), 1, inplace=True)
df4 = df4.drop_duplicates()

df4 = df4.rename(columns={'CountryName':'Country',
                          'CapitalLatitude':'Lat', 
                          'CapitalLongitude':'Long', 
                          'CountryCode':'geoId'})
df4


  df4.drop(df4.columns.difference(['CountryName','CapitalLatitude','CapitalLongitude','CountryCode']), 1, inplace=True)


Unnamed: 0,Country,Lat,Long,geoId
0,Somaliland,9.550000,44.050000,
1,South Georgia and South Sandwich Islands,-54.283333,-36.500000,GS
2,French Southern and Antarctic Lands,-49.350000,70.216667,TF
3,Palestine,31.766667,35.233333,PS
4,Aland Islands,60.116667,19.900000,AX
...,...,...,...,...
240,Northern Cyprus,35.183333,33.366667,
241,Hong Kong,0.000000,0.000000,HK
242,Heard Island and McDonald Islands,0.000000,0.000000,HM
243,British Indian Ocean Territory,-7.300000,72.400000,IO


In [15]:
df = pd.merge(df, df4 , how='left', on='Country')
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId
0,Afghanistan,1/22/20,0,0,0,34.516667,69.183333,AF
1,Albania,1/22/20,0,0,0,41.316667,19.816667,AL
2,Algeria,1/22/20,0,0,0,36.750000,3.050000,DZ
3,Andorra,1/22/20,0,0,0,42.500000,1.516667,AD
4,Angola,1/22/20,0,0,0,-8.833333,13.216667,AO
...,...,...,...,...,...,...,...,...
85435,Vietnam,4/10/21,2692,35,2429,21.033333,105.850000,VN
85436,West Bank and Gaza,4/10/21,265897,2838,231288,,,
85437,Yemen,4/10/21,5276,1031,2027,15.350000,44.200000,YE
85438,Zambia,4/10/21,89918,1226,86813,-15.416667,28.283333,ZM


In [16]:
#print(list(df["Country"].value_counts()),print(df["Country"].value_counts().to_string()),len(list(df["Country"].value_counts())))

### 6. Anado las columnas de informacion geografica de los paises al dataframe (df)

In [17]:
df5 = pd.read_csv("data_extra/Cases-Death-Country-TimeSeries.csv")
df5.drop(df5.columns.difference(['geoId','popData2020', 'continentExp']), 1, inplace=True)
df5 = df5.drop_duplicates()
df5

  df5.drop(df5.columns.difference(['geoId','popData2020', 'continentExp']), 1, inplace=True)


Unnamed: 0,geoId,popData2020,continentExp
0,AT,8901064,Europe
297,BE,11522440,Europe
594,BG,6951482,Europe
891,HR,4058165,Europe
1188,CY,888005,Europe
1485,CZ,10693939,Europe
1782,DK,5822763,Europe
2079,EE,1328976,Europe
2376,FI,5525292,Europe
2673,FR,67320216,Europe


In [18]:
df = pd.merge(df, df5 , how='left', on='geoId')
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,popData2020,continentExp
0,Afghanistan,1/22/20,0,0,0,34.516667,69.183333,AF,,
1,Albania,1/22/20,0,0,0,41.316667,19.816667,AL,,
2,Algeria,1/22/20,0,0,0,36.750000,3.050000,DZ,,
3,Andorra,1/22/20,0,0,0,42.500000,1.516667,AD,,
4,Angola,1/22/20,0,0,0,-8.833333,13.216667,AO,,
...,...,...,...,...,...,...,...,...,...,...
85435,Vietnam,4/10/21,2692,35,2429,21.033333,105.850000,VN,,
85436,West Bank and Gaza,4/10/21,265897,2838,231288,,,,,
85437,Yemen,4/10/21,5276,1031,2027,15.350000,44.200000,YE,,
85438,Zambia,4/10/21,89918,1226,86813,-15.416667,28.283333,ZM,,


### 7. Filtro todo el dataframe por continentExP: "Europe"

Para poder aprovechar los dataset de data_extra, que estan centrados unicamente en Europa, y ademas poder centrar mejor el analisis, voy a filtrar el dataframe eliminando todos los paises que no son europeos.

- 7.1. Filtro el dataframe (df) para paises europeos 

In [19]:
filter_europe = df['continentExp'] == 'Europe'

df = df[filter_europe]
df

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Lat,Long,geoId,popData2020,continentExp
9,Austria,1/22/20,0,0,0,48.200000,16.366667,AT,8901064.0,Europe
16,Belgium,1/22/20,0,0,0,50.833333,4.333333,BE,11522440.0,Europe
25,Bulgaria,1/22/20,0,0,0,42.683333,23.316667,BG,6951482.0,Europe
43,Croatia,1/22/20,0,0,0,45.800000,16.000000,HR,4058165.0,Europe
45,Cyprus,1/22/20,0,0,0,35.166667,33.366667,CY,888005.0,Europe
...,...,...,...,...,...,...,...,...,...,...
85389,Romania,4/10/21,1002865,25006,902239,44.433333,26.100000,RO,19328838.0,Europe
85404,Slovakia,4/10/21,370473,10487,255300,48.150000,17.116667,SK,5457873.0,Europe
85405,Slovenia,4/10/21,225950,4112,207927,46.050000,14.516667,SI,2095861.0,Europe
85410,Spain,4/10/21,3347512,76328,150376,40.400000,-3.683333,ES,47332614.0,Europe


### 8. Cambio de tipo de datos e indice

In [20]:
#df.dtypes

In [21]:
# Cabia Date a tipo fecha
df['Date'] = pd.to_datetime(df.Date)
df['Date-Copy'] = df['Date']
df.dtypes



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df.Date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date-Copy'] = df['Date']


Country                 object
Date            datetime64[ns]
Confirmed                int64
Deaths                   int64
Recovered                int64
Lat                    float64
Long                   float64
geoId                   object
popData2020            float64
continentExp            object
Date-Copy       datetime64[ns]
dtype: object

In [22]:
# Utilizo la columna Date como indice
df.set_index('Date', inplace=True)

### 9. Extraccion de nuevas columnas a traves del indice `Date`

In [23]:
def atributos_fecha (data):
    data['Year'] = data.index.year
    data['Month'] = data.index.month
    data['Week'] = data.index.week
    data['Week-Copy'] = data.index.week
    data['Day'] = data.index.month
    return(data)

In [24]:
atributos_fecha(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Year'] = data.index.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Month'] = data.index.month
  data['Week'] = data.index.week
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Week'] = data.index.week
  data['Week-Copy'] = data.index.week
A value is trying to be set on a copy o

Unnamed: 0_level_0,Country,Confirmed,Deaths,Recovered,Lat,Long,geoId,popData2020,continentExp,Date-Copy,Year,Month,Week,Week-Copy,Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-01-22,Austria,0,0,0,48.200000,16.366667,AT,8901064.0,Europe,2020-01-22,2020,1,4,4,1
2020-01-22,Belgium,0,0,0,50.833333,4.333333,BE,11522440.0,Europe,2020-01-22,2020,1,4,4,1
2020-01-22,Bulgaria,0,0,0,42.683333,23.316667,BG,6951482.0,Europe,2020-01-22,2020,1,4,4,1
2020-01-22,Croatia,0,0,0,45.800000,16.000000,HR,4058165.0,Europe,2020-01-22,2020,1,4,4,1
2020-01-22,Cyprus,0,0,0,35.166667,33.366667,CY,888005.0,Europe,2020-01-22,2020,1,4,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-10,Romania,1002865,25006,902239,44.433333,26.100000,RO,19328838.0,Europe,2021-04-10,2021,4,14,14,4
2021-04-10,Slovakia,370473,10487,255300,48.150000,17.116667,SK,5457873.0,Europe,2021-04-10,2021,4,14,14,4
2021-04-10,Slovenia,225950,4112,207927,46.050000,14.516667,SI,2095861.0,Europe,2021-04-10,2021,4,14,14,4
2021-04-10,Spain,3347512,76328,150376,40.400000,-3.683333,ES,47332614.0,Europe,2021-04-10,2021,4,14,14,4


In [25]:
df['Year'] = df['Year'].astype({"Year": str})
df['Week'] = df['Week'].astype({"Week": str})
df['Week-Copy'] = df['Week'].astype({"Week": str})
df['Day'] = df['Day'].astype({"Day": str})
df['Month'] = df['Month'].astype({"Month": str})
df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = df['Year'].astype({"Year": str})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Week'] = df['Week'].astype({"Week": str})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Week-Copy'] = df['Week'].astype({"Week": str})
A value is trying to be set on a copy of a slice from a DataFr

Country                 object
Confirmed                int64
Deaths                   int64
Recovered                int64
Lat                    float64
Long                   float64
geoId                   object
popData2020            float64
continentExp            object
Date-Copy       datetime64[ns]
Year                    object
Month                   object
Week                    object
Week-Copy               object
Day                     object
dtype: object

In [26]:

df["Week-Copy"] = df["Week-Copy"].replace('1','01')
df["Week-Copy"] = df["Week-Copy"].replace('2','02')
df["Week-Copy"] = df["Week-Copy"].replace('3','03')
df["Week-Copy"] = df["Week-Copy"].replace('4','04')
df["Week-Copy"] = df["Week-Copy"].replace('6','06')
df["Week-Copy"] = df["Week-Copy"].replace('7','08')
df["Week-Copy"] = df["Week-Copy"].replace('8','08')
df["Week-Copy"] = df["Week-Copy"].replace('9','09')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('1','01')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('2','02')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Week-Copy"] = df["Week-Copy"].replace('3','03')
A value is trying to be set on a copy of a slice 

In [27]:
df["Year-Week"] = df["Year"] + "-" + df["Week-Copy"]
df["Year-Week-Copy"] = df["Year"] + "-W" + df["Week-Copy"]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year-Week"] = df["Year"] + "-" + df["Week-Copy"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year-Week-Copy"] = df["Year"] + "-W" + df["Week-Copy"]


Unnamed: 0_level_0,Country,Confirmed,Deaths,Recovered,Lat,Long,geoId,popData2020,continentExp,Date-Copy,Year,Month,Week,Week-Copy,Day,Year-Week,Year-Week-Copy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-01-22,Austria,0,0,0,48.200000,16.366667,AT,8901064.0,Europe,2020-01-22,2020,1,4,04,1,2020-04,2020-W04
2020-01-22,Belgium,0,0,0,50.833333,4.333333,BE,11522440.0,Europe,2020-01-22,2020,1,4,04,1,2020-04,2020-W04
2020-01-22,Bulgaria,0,0,0,42.683333,23.316667,BG,6951482.0,Europe,2020-01-22,2020,1,4,04,1,2020-04,2020-W04
2020-01-22,Croatia,0,0,0,45.800000,16.000000,HR,4058165.0,Europe,2020-01-22,2020,1,4,04,1,2020-04,2020-W04
2020-01-22,Cyprus,0,0,0,35.166667,33.366667,CY,888005.0,Europe,2020-01-22,2020,1,4,04,1,2020-04,2020-W04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-10,Romania,1002865,25006,902239,44.433333,26.100000,RO,19328838.0,Europe,2021-04-10,2021,4,14,14,4,2021-14,2021-W14
2021-04-10,Slovakia,370473,10487,255300,48.150000,17.116667,SK,5457873.0,Europe,2021-04-10,2021,4,14,14,4,2021-14,2021-W14
2021-04-10,Slovenia,225950,4112,207927,46.050000,14.516667,SI,2095861.0,Europe,2021-04-10,2021,4,14,14,4,2021-14,2021-W14
2021-04-10,Spain,3347512,76328,150376,40.400000,-3.683333,ES,47332614.0,Europe,2021-04-10,2021,4,14,14,4,2021-14,2021-W14


### 10. Preaparo el dataframe (df) para el mergeo con `data_extra`. Ordeno las columnas y elimino las columnas sobrantes.  

In [28]:
print(df.columns)
df = df.drop(['continentExp'], axis=1)

# Reordeno las Columnas
df = df[['Date-Copy','Country','geoId','Lat','Long','Year','Month','Week','Day','popData2020','Confirmed','Deaths',
         'Recovered','Week-Copy','Year-Week','Year-Week-Copy']]

df = df.rename(columns={'popData2020':'Population'})
df['Population'] = df['Population'].astype(int)
df['id-merge'] = df['geoId']+df['Year-Week-Copy']

#df.dtypes
df

Index(['Country', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long', 'geoId',
       'popData2020', 'continentExp', 'Date-Copy', 'Year', 'Month', 'Week',
       'Week-Copy', 'Day', 'Year-Week', 'Year-Week-Copy'],
      dtype='object')


Unnamed: 0_level_0,Date-Copy,Country,geoId,Lat,Long,Year,Month,Week,Day,Population,Confirmed,Deaths,Recovered,Week-Copy,Year-Week,Year-Week-Copy,id-merge
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-01-22,2020-01-22,Austria,AT,48.200000,16.366667,2020,1,4,1,8901064,0,0,0,04,2020-04,2020-W04,AT2020-W04
2020-01-22,2020-01-22,Belgium,BE,50.833333,4.333333,2020,1,4,1,11522440,0,0,0,04,2020-04,2020-W04,BE2020-W04
2020-01-22,2020-01-22,Bulgaria,BG,42.683333,23.316667,2020,1,4,1,6951482,0,0,0,04,2020-04,2020-W04,BG2020-W04
2020-01-22,2020-01-22,Croatia,HR,45.800000,16.000000,2020,1,4,1,4058165,0,0,0,04,2020-04,2020-W04,HR2020-W04
2020-01-22,2020-01-22,Cyprus,CY,35.166667,33.366667,2020,1,4,1,888005,0,0,0,04,2020-04,2020-W04,CY2020-W04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-10,2021-04-10,Romania,RO,44.433333,26.100000,2021,4,14,4,19328838,1002865,25006,902239,14,2021-14,2021-W14,RO2021-W14
2021-04-10,2021-04-10,Slovakia,SK,48.150000,17.116667,2021,4,14,4,5457873,370473,10487,255300,14,2021-14,2021-W14,SK2021-W14
2021-04-10,2021-04-10,Slovenia,SI,46.050000,14.516667,2021,4,14,4,2095861,225950,4112,207927,14,2021-14,2021-W14,SI2021-W14
2021-04-10,2021-04-10,Spain,ES,40.400000,-3.683333,2021,4,14,4,47332614,3347512,76328,150376,14,2021-14,2021-W14,ES2021-W14


### 11. Limpieza DataSet `test_rate.csv`

In [29]:
df_ex1 = pd.read_csv("data_extra/test_rate.csv")

- 11.1. Dentro del dataset exiten datos a nivel nacional y por comunidades. Vamos a filtrar por nacional, para descartar todas las lineas por provicia, ya que nuestro analisis es a nivel nacional en Europa. 

In [30]:
filter_national = df_ex1['level'] == 'national'

df_ex1 = df_ex1[filter_national]

- 11.2. Eliminamos columnas no necesarias

In [31]:
df_ex1 = df_ex1.drop(['region_name', 'new_cases', 'testing_data_source','population','region_name','level','region'], axis=1)


In [32]:
df_ex1['id-merge'] = df_ex1['country_code'] + df_ex1['year_week']
df_ex1 = df_ex1.drop(['year_week', 'country_code', 'country'], axis=1)
df_ex1


Unnamed: 0,tests_done,testing_rate,positivity_rate,id-merge
0,12339,138.623877,14.895859,AT2020-W15
1,58488,657.089984,1.169471,AT2020-W16
2,33443,375.719128,1.339593,AT2020-W17
3,26598,298.818209,1.173021,AT2020-W18
4,42153,473.572598,0.626290,AT2020-W19
...,...,...,...,...
11750,123920,1199.892831,5.725468,SE2021-W46
11751,226289,2191.111594,5.265833,SE2021-W47
11752,273987,2652.961887,5.037465,SE2021-W48
11753,335956,3252.995447,5.554001,SE2021-W49


### 12. Limpieza DataSet `vaccine_tracker.csv`

In [33]:
df_ex2 = pd.read_csv("data_extra/vaccine_tracker.csv")

In [34]:
df_ex2

Unnamed: 0,YearWeekISO,ReportingCountry,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose,DoseAdditional1,UnknownDose,Region,TargetGroup,Vaccine,Population
0,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,MOD,8901064
1,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,JANSS,8901064
2,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,UNK,8901064
3,2020-W53,AT,8901064.0,61425.0,0.0,5243,,0,0,0,AT,ALL,COM,8901064
4,2020-W53,AT,8901064.0,0.0,0.0,0,,0,0,0,AT,ALL,AZ,8901064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197511,2021-W51,SK,391090.0,0.0,0.0,23,,8,292,0,SK,Age70_79,MOD,5457873
197512,2021-W51,SK,391090.0,0.0,0.0,163,,201,1457,0,SK,Age70_79,COM,5457873
197513,2021-W51,SK,184680.0,0.0,0.0,1,,0,0,0,SK,Age80+,JANSS,5457873
197514,2021-W51,SK,184680.0,0.0,0.0,70,,76,451,0,SK,Age80+,COM,5457873


In [35]:
df_ex2 = df_ex2.drop(['Denominator', 'DoseAdditional1', 'UnknownDose','Population','Vaccine'], axis=1)

filter_vacc = df_ex2['TargetGroup'] == 'ALL'
df_ex2 = df_ex2[filter_vacc]

df_ex2 = df_ex2.drop(['TargetGroup'], axis=1)

df_ex2['id-merge'] = df_ex2['ReportingCountry'] + df_ex2['YearWeekISO']

df_ex2 = df_ex2.drop(['YearWeekISO', 'ReportingCountry', 'Region'], axis=1)

df_ex2 = df_ex2.groupby(['id-merge']).sum().reset_index()

df_ex2

Unnamed: 0,id-merge,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose
0,AT2020-W53,61425.0,0.0,5243,0.0,0
1,AT2021-W01,61425.0,0.0,26181,0.0,0
2,AT2021-W02,68625.0,0.0,84934,0.0,398
3,AT2021-W03,58500.0,0.0,93267,0.0,4568
4,AT2021-W04,54990.0,0.0,31517,0.0,17516
...,...,...,...,...,...,...
1548,SK2021-W47,0.0,0.0,49023,0.0,6738
1549,SK2021-W48,0.0,0.0,26133,0.0,7134
1550,SK2021-W49,0.0,0.0,20317,0.0,14250
1551,SK2021-W50,0.0,0.0,16742,0.0,21223


### 13. Enriquecimiento de `df` con DataSet `test_rate.csv` y `vaccine_tracker.csv`

In [36]:
df = pd.merge(df, df_ex1 , how='left', on='id-merge')
df = pd.merge(df, df_ex2 , how='left', on='id-merge')
df

Unnamed: 0,Date-Copy,Country,geoId,Lat,Long,Year,Month,Week,Day,Population,...,Year-Week-Copy,id-merge,tests_done,testing_rate,positivity_rate,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose
0,2020-01-22,Austria,AT,48.200000,16.366667,2020,1,4,1,8901064,...,2020-W04,AT2020-W04,,,,,,,,
1,2020-01-22,Belgium,BE,50.833333,4.333333,2020,1,4,1,11522440,...,2020-W04,BE2020-W04,,,,,,,,
2,2020-01-22,Bulgaria,BG,42.683333,23.316667,2020,1,4,1,6951482,...,2020-W04,BG2020-W04,,,,,,,,
3,2020-01-22,Croatia,HR,45.800000,16.000000,2020,1,4,1,4058165,...,2020-W04,HR2020-W04,,,,,,,,
4,2020-01-22,Cyprus,CY,35.166667,33.366667,2020,1,4,1,888005,...,2020-W04,CY2020-W04,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12455,2021-04-10,Romania,RO,44.433333,26.100000,2021,4,14,4,19328838,...,2021-W14,RO2021-W14,221578.0,1146.359652,13.577160,511290.0,0.0,176843.0,0.0,216048.0
12456,2021-04-10,Slovakia,SK,48.150000,17.116667,2021,4,14,4,5457873,...,2021-W14,SK2021-W14,55225.0,1011.841060,21.305568,127740.0,0.0,109640.0,0.0,25683.0
12457,2021-04-10,Slovenia,SI,46.050000,14.516667,2021,4,14,4,2095861,...,2021-W14,SI2021-W14,147444.0,7035.008524,4.323675,55350.0,0.0,52619.0,0.0,2897.0
12458,2021-04-10,Spain,ES,40.400000,-3.683333,2021,4,14,4,47332614,...,2021-W14,ES2021-W14,830074.0,1753.704116,7.203575,2213630.0,0.0,1763455.0,0.0,239601.0


### 14. Preparacion dataframe final para carga en Mongo DB

In [37]:
df.columns

Index(['Date-Copy', 'Country', 'geoId', 'Lat', 'Long', 'Year', 'Month', 'Week',
       'Day', 'Population', 'Confirmed', 'Deaths', 'Recovered', 'Week-Copy',
       'Year-Week', 'Year-Week-Copy', 'id-merge', 'tests_done', 'testing_rate',
       'positivity_rate', 'NumberDosesReceived', 'NumberDosesExported',
       'FirstDose', 'FirstDoseRefused', 'SecondDose'],
      dtype='object')

In [38]:
df = df.drop(['id-merge', 'Year-Week-Copy', 'Week-Copy', 'geoId'], axis=1)
df = df.rename(columns={'NumberDosesReceived':'DosesReceived',
                        'NumberDosesExported':'DosesExported',
                        'Date-Copy':'Date'})


df

Unnamed: 0,Date,Country,Lat,Long,Year,Month,Week,Day,Population,Confirmed,...,Recovered,Year-Week,tests_done,testing_rate,positivity_rate,DosesReceived,DosesExported,FirstDose,FirstDoseRefused,SecondDose
0,2020-01-22,Austria,48.200000,16.366667,2020,1,4,1,8901064,0,...,0,2020-04,,,,,,,,
1,2020-01-22,Belgium,50.833333,4.333333,2020,1,4,1,11522440,0,...,0,2020-04,,,,,,,,
2,2020-01-22,Bulgaria,42.683333,23.316667,2020,1,4,1,6951482,0,...,0,2020-04,,,,,,,,
3,2020-01-22,Croatia,45.800000,16.000000,2020,1,4,1,4058165,0,...,0,2020-04,,,,,,,,
4,2020-01-22,Cyprus,35.166667,33.366667,2020,1,4,1,888005,0,...,0,2020-04,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12455,2021-04-10,Romania,44.433333,26.100000,2021,4,14,4,19328838,1002865,...,902239,2021-14,221578.0,1146.359652,13.577160,511290.0,0.0,176843.0,0.0,216048.0
12456,2021-04-10,Slovakia,48.150000,17.116667,2021,4,14,4,5457873,370473,...,255300,2021-14,55225.0,1011.841060,21.305568,127740.0,0.0,109640.0,0.0,25683.0
12457,2021-04-10,Slovenia,46.050000,14.516667,2021,4,14,4,2095861,225950,...,207927,2021-14,147444.0,7035.008524,4.323675,55350.0,0.0,52619.0,0.0,2897.0
12458,2021-04-10,Spain,40.400000,-3.683333,2021,4,14,4,47332614,3347512,...,150376,2021-14,830074.0,1753.704116,7.203575,2213630.0,0.0,1763455.0,0.0,239601.0
