# Limpeza dos dados brutos de COVID-19

## Importação dos pacotes

In [1]:
import pandas as pd

## Leitura dos dados brutos

In [2]:
# Read the gzipped raw CSV and parse both date columns as datetimes.
colunas_de_data = ['date', 'last_available_date']
dados = pd.read_csv('../dados_brutos/caso_full.csv.gz', parse_dates=colunas_de_data)

In [3]:
# Preview the first five rows of the raw data.
dados.head(n=5)

Unnamed: 0,city,city_ibge_code,date,epidemiological_week,estimated_population,estimated_population_2019,is_last,is_repeated,last_available_confirmed,last_available_confirmed_per_100k_inhabitants,last_available_date,last_available_death_rate,last_available_deaths,order_for_place,place_type,state,new_confirmed,new_deaths
0,Rio Branco,1200401.0,2020-03-17,202012,413418.0,407319.0,False,False,3,0.72566,2020-03-17,0.0,0,1,city,AC,3,0
1,,12.0,2020-03-17,202012,894470.0,881935.0,False,False,3,0.33539,2020-03-17,0.0,0,1,state,AC,3,0
2,Rio Branco,1200401.0,2020-03-18,202012,413418.0,407319.0,False,False,3,0.72566,2020-03-18,0.0,0,2,city,AC,0,0
3,,12.0,2020-03-18,202012,894470.0,881935.0,False,False,3,0.33539,2020-03-18,0.0,0,2,state,AC,0,0
4,Rio Branco,1200401.0,2020-03-19,202012,413418.0,407319.0,False,False,4,0.96754,2020-03-19,0.0,0,3,city,AC,1,0


In [4]:
# Rename the new-cases and new-deaths columns to the Portuguese names used
# by the rest of the notebook.
novos_nomes = {
    'new_confirmed': 'casos_novos',
    'new_deaths': 'obitos_novos',
}
dados = dados.rename(columns=novos_nomes)
dados.head()

Unnamed: 0,city,city_ibge_code,date,epidemiological_week,estimated_population,estimated_population_2019,is_last,is_repeated,last_available_confirmed,last_available_confirmed_per_100k_inhabitants,last_available_date,last_available_death_rate,last_available_deaths,order_for_place,place_type,state,casos_novos,obitos_novos
0,Rio Branco,1200401.0,2020-03-17,202012,413418.0,407319.0,False,False,3,0.72566,2020-03-17,0.0,0,1,city,AC,3,0
1,,12.0,2020-03-17,202012,894470.0,881935.0,False,False,3,0.33539,2020-03-17,0.0,0,1,state,AC,3,0
2,Rio Branco,1200401.0,2020-03-18,202012,413418.0,407319.0,False,False,3,0.72566,2020-03-18,0.0,0,2,city,AC,0,0
3,,12.0,2020-03-18,202012,894470.0,881935.0,False,False,3,0.33539,2020-03-18,0.0,0,2,state,AC,0,0
4,Rio Branco,1200401.0,2020-03-19,202012,413418.0,407319.0,False,False,4,0.96754,2020-03-19,0.0,0,3,city,AC,1,0


In [5]:
# Keep only the state-level rows, then add up the new cases and new deaths
# reported on each date.
apenas_estados = dados.loc[dados['place_type'] == 'state']
dados_br = (
    apenas_estados
    .groupby(['date'])
    .agg({'casos_novos': 'sum', 'obitos_novos': 'sum'})
    .reset_index()
)
dados_br.head()

Unnamed: 0,date,casos_novos,obitos_novos
0,2020-02-25,1,0
1,2020-02-26,0,0
2,2020-02-27,0,0
3,2020-02-28,1,0
4,2020-02-29,0,0


In [6]:
# 7-day trailing (non-centered) moving averages for cases and deaths.
# The first six rows are NaN because a full 7-row window is not yet available.
janela_dias = 7
pares_colunas = (
    ('casos_novos', 'MM7_casos'),
    ('obitos_novos', 'MM7_obitos'),
)
for coluna_origem, coluna_media in pares_colunas:
    dados_br[coluna_media] = dados_br[coluna_origem].rolling(janela_dias, center=False).mean()

# Month-year label formatted as MM-YYYY, derived from the date column.
dados_br['mes/ano'] = dados_br['date'].dt.strftime('%m-%Y')
dados_br.head()

Unnamed: 0,date,casos_novos,obitos_novos,MM7_casos,MM7_obitos,mes/ano
0,2020-02-25,1,0,,,02-2020
1,2020-02-26,0,0,,,02-2020
2,2020-02-27,0,0,,,02-2020
3,2020-02-28,1,0,,,02-2020
4,2020-02-29,0,0,,,02-2020


In [8]:
# Monthly lethality: total deaths divided by total cases within each
# month/year, attached to every daily row of that month.
let_mes = (
    dados_br
    .groupby('mes/ano', as_index=False)
    .agg({'casos_novos': 'sum', 'obitos_novos': 'sum'})
)
# NOTE(review): a month whose case total is 0 yields NaN or inf here — confirm
# whether that can occur in the data and how it should be handled.
let_mes['letalidade_mes'] = let_mes['obitos_novos'] / let_mes['casos_novos']

# Drop any 'letalidade_mes' left by a previous run of this cell; otherwise the
# merge would create 'letalidade_mes_x' / 'letalidade_mes_y' on re-execution.
# A left, many-to-one merge keeps every daily row in its original order and
# raises if a month label is ever duplicated in let_mes.
dados_br = (
    dados_br
    .drop(columns='letalidade_mes', errors='ignore')
    .merge(
        let_mes[['mes/ano', 'letalidade_mes']],
        on='mes/ano',
        how='left',
        validate='many_to_one',
    )
)
dados_br.head()

Unnamed: 0,date,casos_novos,obitos_novos,MM7_casos,MM7_obitos,mes/ano,letalidade_mes
0,2020-02-25,1,0,,,02-2020,0.0
1,2020-02-26,0,0,,,02-2020,0.0
2,2020-02-27,0,0,,,02-2020,0.0
3,2020-02-28,1,0,,,02-2020,0.0
4,2020-02-29,0,0,,,02-2020,0.0


In [9]:
# Write the cleaned data as CSV into the clean-data folder, using the frame's
# first column as the index so no extra positional index column is written.
coluna_indice = dados_br.columns[0]
dados_br_indexado = dados_br.set_index(coluna_indice)
dados_br_indexado.to_csv('../dados_limpos/dados_br')