### Projeto de análise de dados - Dados sobre COVID-19 no Brasil (2021)

Este projeto visa entender, por meio de um relatório mensal, como foi o comportamento dos casos e óbitos da COVID-19 no Brasil durante o ano de 2021.

In [1]:
# Importando bibliotecas que serão utilizadas
import os
import pandas as pd
import numpy as np
import os

In [2]:
# Resolve the raw-data folder relative to the current working directory.
PATH_DIR = os.getcwd()
DATA_DIR = os.path.join(PATH_DIR, "..", "data")
# os.listdir returns entries in arbitrary order; sort them so that any
# positional selection from DATA_PATH picks the same files on every machine
# and on every run.
# NOTE(review): confirm that sorted order still places the intended files at
# the positions used when loading the data.
DATA_PATH = [os.path.join(DATA_DIR, name) for name in sorted(os.listdir(DATA_DIR))]

In [3]:
# Output locations, resolved relative to the current working directory.
PATH_OUTPUT = os.path.join(
  PATH_DIR,
  "..",
  "output",
)
# Destination file for the exported Brazil-only 2021 data.
PATH_DATA_BRAZIL_OUTPUT = os.path.join(
  PATH_OUTPUT,
  "data_brazil_2021.csv",
)

In [4]:
# Load the selected raw files (";"-separated, UTF-8) and stack them into one
# frame with a fresh 0..n-1 index.
# NOTE(review): the [2:4] slice presumably selects the 2021 files; it depends
# on the ordering of DATA_PATH — confirm against the contents of the data folder.
# All files are read first and concatenated once, instead of re-copying the
# accumulated frame on every iteration. If the slice selects no files,
# pd.concat raises ValueError here rather than leaving `df` as None.
df = pd.concat(
  [pd.read_csv(path, sep=";", encoding="utf-8") for path in DATA_PATH[2:4]],
  ignore_index=True,
)

In [5]:
# Preview the first rows of the combined frame
df.head()

Unnamed: 0,regiao,estado,municipio,coduf,codmun,codRegiaoSaude,nomeRegiaoSaude,data,semanaEpi,populacaoTCU2019,casosAcumulado,casosNovos,obitosAcumulado,obitosNovos,Recuperadosnovos,emAcompanhamentoNovos,interior/metropolitana
0,Brasil,,,76,,,,2021-01-01,53,210147125.0,7700578.0,24605,195411,462,6747065.0,733959.0,
1,Brasil,,,76,,,,2021-01-02,53,210147125.0,7716405.0,15827,195725,314,6756284.0,748883.0,
2,Brasil,,,76,,,,2021-01-03,1,210147125.0,7733746.0,17341,196018,293,6769420.0,751260.0,
3,Brasil,,,76,,,,2021-01-04,1,210147125.0,7753752.0,20006,196561,543,6813008.0,724720.0,
4,Brasil,,,76,,,,2021-01-05,1,210147125.0,7810400.0,56648,197732,1171,6875230.0,681961.0,


In [6]:
# Keep only the rows whose "regiao" value is "Brasil".
# NOTE(review): dfs_years_brazil is not used by any cell visible here —
# confirm whether it is still needed.
dfs_years_brazil = {}

df_brazil = df.loc[df["regiao"].eq("Brasil")]

In [7]:
# Display the filtered frame (rows keep their index labels from the combined frame)
df_brazil

Unnamed: 0,regiao,estado,municipio,coduf,codmun,codRegiaoSaude,nomeRegiaoSaude,data,semanaEpi,populacaoTCU2019,casosAcumulado,casosNovos,obitosAcumulado,obitosNovos,Recuperadosnovos,emAcompanhamentoNovos,interior/metropolitana
0,Brasil,,,76,,,,2021-01-01,53,210147125.0,7700578.0,24605,195411,462,6747065.0,733959.0,
1,Brasil,,,76,,,,2021-01-02,53,210147125.0,7716405.0,15827,195725,314,6756284.0,748883.0,
2,Brasil,,,76,,,,2021-01-03,1,210147125.0,7733746.0,17341,196018,293,6769420.0,751260.0,
3,Brasil,,,76,,,,2021-01-04,1,210147125.0,7753752.0,20006,196561,543,6813008.0,724720.0,
4,Brasil,,,76,,,,2021-01-05,1,210147125.0,7810400.0,56648,197732,1171,6875230.0,681961.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017218,Brasil,,,76,,,,2021-12-27,52,210147125.0,22246276.0,6840,618534,86,21557380.0,70362.0,
1017219,Brasil,,,76,,,,2021-12-28,52,210147125.0,22254706.0,8430,618705,171,21567845.0,68156.0,
1017220,Brasil,,,76,,,,2021-12-29,52,210147125.0,22263834.0,9128,618817,112,21575485.0,69532.0,
1017221,Brasil,,,76,,,,2021-12-30,52,210147125.0,22277239.0,13405,618984,167,21582700.0,75555.0,


In [8]:
# Replace the index inherited from the combined frame with a fresh 0..n-1
# range; drop=True discards the old labels instead of keeping them as a column.
df_brazil = df_brazil.reset_index(drop=True)
df_brazil

Unnamed: 0,regiao,estado,municipio,coduf,codmun,codRegiaoSaude,nomeRegiaoSaude,data,semanaEpi,populacaoTCU2019,casosAcumulado,casosNovos,obitosAcumulado,obitosNovos,Recuperadosnovos,emAcompanhamentoNovos,interior/metropolitana
0,Brasil,,,76,,,,2021-01-01,53,210147125.0,7700578.0,24605,195411,462,6747065.0,733959.0,
1,Brasil,,,76,,,,2021-01-02,53,210147125.0,7716405.0,15827,195725,314,6756284.0,748883.0,
2,Brasil,,,76,,,,2021-01-03,1,210147125.0,7733746.0,17341,196018,293,6769420.0,751260.0,
3,Brasil,,,76,,,,2021-01-04,1,210147125.0,7753752.0,20006,196561,543,6813008.0,724720.0,
4,Brasil,,,76,,,,2021-01-05,1,210147125.0,7810400.0,56648,197732,1171,6875230.0,681961.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,Brasil,,,76,,,,2021-12-27,52,210147125.0,22246276.0,6840,618534,86,21557380.0,70362.0,
361,Brasil,,,76,,,,2021-12-28,52,210147125.0,22254706.0,8430,618705,171,21567845.0,68156.0,
362,Brasil,,,76,,,,2021-12-29,52,210147125.0,22263834.0,9128,618817,112,21575485.0,69532.0,
363,Brasil,,,76,,,,2021-12-30,52,210147125.0,22277239.0,13405,618984,167,21582700.0,75555.0,


#### Limpando e tratando dados

In [9]:
# Count missing values per column
df_brazil.isna().sum()

regiao                      0
estado                    365
municipio                 365
coduf                       0
codmun                    365
codRegiaoSaude            365
nomeRegiaoSaude           365
data                        0
semanaEpi                   0
populacaoTCU2019            0
casosAcumulado              0
casosNovos                  0
obitosAcumulado             0
obitosNovos                 0
Recuperadosnovos            0
emAcompanhamentoNovos       0
interior/metropolitana    365
dtype: int64

In [10]:
# Summarize column dtypes and non-null counts
df_brazil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   regiao                  365 non-null    object 
 1   estado                  0 non-null      object 
 2   municipio               0 non-null      object 
 3   coduf                   365 non-null    int64  
 4   codmun                  0 non-null      float64
 5   codRegiaoSaude          0 non-null      float64
 6   nomeRegiaoSaude         0 non-null      object 
 7   data                    365 non-null    object 
 8   semanaEpi               365 non-null    int64  
 9   populacaoTCU2019        365 non-null    float64
 10  casosAcumulado          365 non-null    float64
 11  casosNovos              365 non-null    int64  
 12  obitosAcumulado         365 non-null    int64  
 13  obitosNovos             365 non-null    int64  
 14  Recuperadosnovos        365 non-null    fl

In [11]:
# Check that the frame really holds only nationwide rows and no state-level
# ones: print the distinct values of the population, municipality and state columns.
print(df_brazil["populacaoTCU2019"].unique())
print(df_brazil["municipio"].unique())
print(df_brazil["estado"].unique())


[2.10147125e+08]
[nan]
[nan]


In [12]:
# Store the population figure as an integer.
# Read it from the Brazil-only frame instead of row 0 of the unfiltered frame,
# so the value does not depend on which row happens to come first in the raw
# files. This must run before "populacaoTCU2019" is dropped from df_brazil.
population_brazil = int(df_brazil["populacaoTCU2019"].iloc[0])
population_brazil

210147125

In [13]:
# Remove the location/identifier columns and the population column, which are
# not needed for the nationwide series.
df_brazil = df_brazil.drop(
  columns=[
    "estado",
    "municipio",
    "coduf",
    "codmun",
    "codRegiaoSaude",
    "nomeRegiaoSaude",
    "populacaoTCU2019",
    "interior/metropolitana",
    "regiao",
  ]
)
df_brazil

Unnamed: 0,data,semanaEpi,casosAcumulado,casosNovos,obitosAcumulado,obitosNovos,Recuperadosnovos,emAcompanhamentoNovos
0,2021-01-01,53,7700578.0,24605,195411,462,6747065.0,733959.0
1,2021-01-02,53,7716405.0,15827,195725,314,6756284.0,748883.0
2,2021-01-03,1,7733746.0,17341,196018,293,6769420.0,751260.0
3,2021-01-04,1,7753752.0,20006,196561,543,6813008.0,724720.0
4,2021-01-05,1,7810400.0,56648,197732,1171,6875230.0,681961.0
...,...,...,...,...,...,...,...,...
360,2021-12-27,52,22246276.0,6840,618534,86,21557380.0,70362.0
361,2021-12-28,52,22254706.0,8430,618705,171,21567845.0,68156.0
362,2021-12-29,52,22263834.0,9128,618817,112,21575485.0,69532.0
363,2021-12-30,52,22277239.0,13405,618984,167,21582700.0,75555.0


In [14]:
# Parse the "data" column (YYYY-MM-DD strings) into datetimes
df_brazil = df_brazil.assign(
  data=pd.to_datetime(df_brazil["data"], format="%Y-%m-%d")
)

In [15]:
# Listando os tipos
df_brazil.dtypes

data                     datetime64[ns]
semanaEpi                         int64
casosAcumulado                  float64
casosNovos                        int64
obitosAcumulado                   int64
obitosNovos                       int64
Recuperadosnovos                float64
emAcompanhamentoNovos           float64
dtype: object

In [16]:
# Cast the recovered, under-follow-up and cumulative-cases columns to int64
df_brazil = df_brazil.astype({
  "Recuperadosnovos": "int64",
  "emAcompanhamentoNovos": "int64",
  "casosAcumulado": "int64",
})
df_brazil.dtypes


data                     datetime64[ns]
semanaEpi                         int64
casosAcumulado                    int64
casosNovos                        int64
obitosAcumulado                   int64
obitosNovos                       int64
Recuperadosnovos                  int64
emAcompanhamentoNovos             int64
dtype: object

In [17]:
# Descriptive statistics for the count columns only (date and epidemiological
# week are excluded), rounded to two decimals
df_brazil.drop(columns=["data", "semanaEpi"]).describe().round(2)

Unnamed: 0,casosAcumulado,casosNovos,obitosAcumulado,obitosNovos,Recuperadosnovos,emAcompanhamentoNovos
count,365.0,365.0,365.0,365.0,365.0,365.0
mean,17000494.83,40031.64,464251.65,1161.94,15783165.44,715522.9
std,4776357.02,28463.23,146219.69,951.46,4928408.18,386762.71
min,7700578.0,-573.0,195411.0,24.0,6747065.0,68156.0
25%,12910082.0,13957.0,328206.0,389.0,11239099.0,359523.0
50%,18687469.0,34339.0,521952.0,962.0,16931272.0,791923.0
75%,21445651.0,64025.0,597255.0,1641.0,20432643.0,1076873.0
max,22287521.0,150106.0,619056.0,4249.0,21584402.0,1317658.0


#### Exportação

In [18]:
# Export the cleaned Brazil data as a ";"-separated CSV (the frame's index is
# written as the first column).
# Create the destination folder first: to_csv does not create missing
# directories and would fail on a fresh checkout without it.
os.makedirs(os.path.dirname(PATH_DATA_BRAZIL_OUTPUT), exist_ok=True)
df_brazil.to_csv(PATH_DATA_BRAZIL_OUTPUT, sep=";")