# Projeto de análise de dados da COVID-19 no Brasil

## DIO - Digital Innovation One

### Ronaldo Nunes

Fazendo a improtação das bibliotecas necessárias para o projeto

In [47]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
import re
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

Importando os dados a serem analizados.

In [48]:
url = "https://raw.githubusercontent.com/Ronaldo-Nunes/Cursos/main/Geracao-Tech-Unimed-BH-Ciencia-Dados/predicao-dados-covid/dados/time-series-19-covid-combined.csv"
df = pd.read_csv(url, parse_dates=["Date"])
df

Unnamed: 0,Date,Country/Region,Province/State,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,,0,0.0,0
1,2020-01-23,Afghanistan,,0,0.0,0
2,2020-01-24,Afghanistan,,0,0.0,0
3,2020-01-25,Afghanistan,,0,0.0,0
4,2020-01-26,Afghanistan,,0,0.0,0
...,...,...,...,...,...,...
231739,2022-04-12,Zimbabwe,,247094,0.0,5460
231740,2022-04-13,Zimbabwe,,247160,0.0,5460
231741,2022-04-14,Zimbabwe,,247208,0.0,5462
231742,2022-04-15,Zimbabwe,,247237,0.0,5462


Conferindo os tipos de dados das colunas.

In [49]:
df.dtypes

Date              datetime64[ns]
Country/Region            object
Province/State            object
Confirmed                  int64
Recovered                float64
Deaths                     int64
dtype: object

Criação de função para normalizar os nomes das colunas.

In [50]:
def corrige_colunas(col_name):
    return re.sub(r"[/| ]", "_", col_name).lower()

In [51]:
df.columns = [corrige_colunas(col) for col in df.columns]
df

Unnamed: 0,date,country_region,province_state,confirmed,recovered,deaths
0,2020-01-22,Afghanistan,,0,0.0,0
1,2020-01-23,Afghanistan,,0,0.0,0
2,2020-01-24,Afghanistan,,0,0.0,0
3,2020-01-25,Afghanistan,,0,0.0,0
4,2020-01-26,Afghanistan,,0,0.0,0
...,...,...,...,...,...,...
231739,2022-04-12,Zimbabwe,,247094,0.0,5460
231740,2022-04-13,Zimbabwe,,247160,0.0,5460
231741,2022-04-14,Zimbabwe,,247208,0.0,5462
231742,2022-04-15,Zimbabwe,,247237,0.0,5462


## Análises

Primeiro, veremos quais os países constantes no dataset para, daí, extrairmos os dados relativos apenas ao Brasil.

In [52]:
df.country_region.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana',
       'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burma', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark',
       'Diamond Princess', 'Djibouti', 'Dominica', 'Dominican Republic',
       'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece',
       'Grenada', 'Gua

In [53]:
df.loc[df.country_region == "Brazil"]

Unnamed: 0,date,country_region,province_state,confirmed,recovered,deaths
25296,2020-01-22,Brazil,,0,0.0,0
25297,2020-01-23,Brazil,,0,0.0,0
25298,2020-01-24,Brazil,,0,0.0,0
25299,2020-01-25,Brazil,,0,0.0,0
25300,2020-01-26,Brazil,,0,0.0,0
...,...,...,...,...,...,...
26107,2022-04-12,Brazil,,30184286,0.0,661741
26108,2022-04-13,Brazil,,30210934,0.0,661904
26109,2022-04-14,Brazil,,30234024,0.0,662043
26110,2022-04-15,Brazil,,30247302,0.0,662154


Como não há dados de províncias (Estados), removeremos a coluna com valores nulos.

In [54]:
df.drop(columns=["province_state"], inplace=True)
df

Unnamed: 0,date,country_region,confirmed,recovered,deaths
0,2020-01-22,Afghanistan,0,0.0,0
1,2020-01-23,Afghanistan,0,0.0,0
2,2020-01-24,Afghanistan,0,0.0,0
3,2020-01-25,Afghanistan,0,0.0,0
4,2020-01-26,Afghanistan,0,0.0,0
...,...,...,...,...,...
231739,2022-04-12,Zimbabwe,247094,0.0,5460
231740,2022-04-13,Zimbabwe,247160,0.0,5460
231741,2022-04-14,Zimbabwe,247208,0.0,5462
231742,2022-04-15,Zimbabwe,247237,0.0,5462


## Casos confirmados no Brasil

In [55]:
brasil = df.loc[(df.country_region == "Brazil") & (df.confirmed > 0)]
brasil

Unnamed: 0,date,country_region,confirmed,recovered,deaths
25331,2020-02-26,Brazil,1,0.0,0
25332,2020-02-27,Brazil,1,0.0,0
25333,2020-02-28,Brazil,1,0.0,0
25334,2020-02-29,Brazil,2,0.0,0
25335,2020-03-01,Brazil,2,0.0,0
...,...,...,...,...,...
26107,2022-04-12,Brazil,30184286,0.0,661741
26108,2022-04-13,Brazil,30210934,0.0,661904
26109,2022-04-14,Brazil,30234024,0.0,662043
26110,2022-04-15,Brazil,30247302,0.0,662154


In [56]:
px.line(brasil, "date", "confirmed",
        labels={"date": "Período", "confirmed": "Número de casos confirmados"},
        title="Casos confirmados no Brasil")

## Número de novos casos por dia

In [57]:
# Implementação de função para realizar a contagem de novos casos por dia
brasil["novos_casos_dia"] = list(map(
        lambda x: 0 if x == 0 else brasil["confirmed"].iloc[x] - brasil["confirmed"].iloc[x-1],
        np.arange(brasil.shape[0])
    )
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [58]:
# Visualizando os dados no gráfico
px.line(brasil, "date", "novos_casos_dia",
        labels={"date": "Período", "novos_casos_dia": "Novos casos"},
        title="Novos casos registrados por dia")

## Mortes

In [59]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(x=brasil.date, y=brasil.deaths, name="Mortes", line=dict(color="red"))
)

fig.update_layout(title="Mortes por COVID-19 no Brasil", xaxis_title="Período", yaxis_title="Número de mortes")
fig.show()

## Comparação dos casos confirmados e pessoas recuperadas

In [60]:
fig = go.Figure()

# Casos confirmados
fig.add_trace(go.Scatter(x=brasil.date, y=brasil.confirmed, name="Confirmados", line=dict(color="orange")))

# Recuperados
fig.add_trace(go.Scatter(x=brasil.date, y=brasil.recovered, name="Curados", line=dict(color="green")))

fig.update_layout(title="Evolução da COVID-19 no Brasil", xaxis_title="Período", yaxis_title="Número de registros")
fig.show()

É possível vislumbrar a proximidade das curvas de infectados e pessoas recuperadas, ao menos até 04 de agosto de 2021, data do último registro para essa medida.

## Taxa de crescimento

Calculando a taxa de crescimento do COVID desde o primeiro caso.

In [61]:
def taxa_crescimento(dados, variavel, data_inicio=None, data_fim=None):
    # Se data_inicio for None, define como a primeira data disponível no dataset
    if data_inicio == None:
        data_inicio = dados.date.loc[dados[variavel] > 0].min()
    else:
        data_inicio = pd.to_datetime(data_inicio)

    if data_fim == None:
        data_fim = dados.date.iloc[-1]
    else:
        data_fim = pd.to_datetime(data_fim)

    # Define os valores de presente e passado
    passado = dados.loc[dados.date == data_inicio, variavel].values[0]
    presente = dados.loc[dados.date == data_fim, variavel].values[0]
    
    # Define o número de pontos no tempo q vamos avaliar
    n = (data_fim - data_inicio).days
    
    # Calcula a taxa
    taxa = (presente/passado)**(1/n) - 1

    return taxa*100

In [62]:
cresc_medio = taxa_crescimento(brasil, 'confirmed')
print(f"O crescimento médio do COVID no Brasil no período avaliado foi de {cresc_medio.round(2)}%.")

O crescimento médio do COVID no Brasil no período avaliado foi de 2.23%.


Observaremos o comportamento da **taxa de crescimento no tempo**. Para isso, vamos definir uma função para calcular a taxa de crescimento diária.

In [63]:
def taxa_crescimento_diaria(dados, variavel, data_inicio=None):
    if data_inicio == None:
        data_inicio = dados.date.loc[dados[variavel] > 0].min()
    else:
        data_inicio = pd.to_datetime(data_inicio)
        
    data_fim = dados.date.max()
    n = (data_fim - data_inicio).days
    taxas = list(map(
        lambda x: (dados[variavel].iloc[x] - dados[variavel].iloc[x-1]) / dados[variavel].iloc[x-1],
        range(1,n+1)
    ))
    return np.array(taxas)*100

In [64]:
taxa_dia = taxa_crescimento_diaria(brasil, 'confirmed')
taxa_dia

array([ 0.00000000e+00,  0.00000000e+00,  1.00000000e+02,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  1.00000000e+02,  0.00000000e+00,
        2.25000000e+02,  0.00000000e+00,  5.38461538e+01,  2.50000000e+01,
        2.40000000e+01,  2.25806452e+01,  3.68421053e+01,  1.90384615e+02,
        0.00000000e+00,  7.28476821e+00,  2.34567901e+01,  6.05000000e+01,
        1.58878505e+01,  6.69354839e+01,  2.76972625e+01,  2.87515763e+01,
        5.14201763e+01,  2.44501940e+01,  1.67879418e+01,  1.36626613e+01,
        1.68754894e+01,  1.44723618e+01,  1.42522681e+01,  9.01639344e+00,
        7.58928571e+00,  2.48525879e+01,  1.95732027e+01,  1.76711527e+01,
        1.25808056e+01,  1.43992933e+01,  7.43243243e+00,  9.26325247e+00,
        1.54016939e+01,  1.52201796e+01,  1.18862090e+01,  8.54521335e+00,
        5.54537122e+00,  7.06807546e+00,  5.57858688e+00,  7.81903542e+00,
        1.21051382e+01,  7.43290960e+00,  1.07050123e+01,  8.83557983e+00,
        5.44492335e+00,  

In [65]:
primeiro_dia = brasil.date.loc[brasil.confirmed > 0].min()
px.line(x=pd.date_range(primeiro_dia, brasil.date.max())[1:],
        y=taxa_dia, title='Taxa de crescimento de casos confirmados no Brasil',
        labels={'y':'Taxa de crescimento', 'x':'Período'}
)