# 10 MINUTES TO PANDAS

#### O script a seguir foi desenvolvido usando a documentação oficial como guia, conforme mostra o link abaixo. Muitos dos comandos fogem do padrão da documentação pois são práticas de código para o aprimoramento do conhecimento e habilidade.

## https://pandas.pydata.org/docs/user_guide/10min.html

In [52]:
import pandas as pd
import numpy as np

# CREATE OBJECTS

In [53]:
# Build a Series from a list mixing ints, strings, a float and a missing value
# (NaN); with such mixed content the Series gets the generic ``object`` dtype.
dtSe = pd.Series(
    data=[34, "exa", np.nan, 13, "trainer", 100.98],
)
dtSe

0         34
1        exa
2        NaN
3         13
4    trainer
5     100.98
dtype: object

In [54]:
# Build a DatetimeIndex of 12 consecutive daily dates starting at 2023-01-01.
dt = pd.date_range(start="20230101", periods=12)
dt

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12'],
              dtype='datetime64[ns]', freq='D')

In [55]:
# Wrap the DatetimeIndex ``dt`` in a Series; the dates become the values and
# the Series gets a default integer index.
dtS = pd.Series(data=dt)
dtS

0    2023-01-01
1    2023-01-02
2    2023-01-03
3    2023-01-04
4    2023-01-05
5    2023-01-06
6    2023-01-07
7    2023-01-08
8    2023-01-09
9    2023-01-10
10   2023-01-11
11   2023-01-12
dtype: datetime64[ns]

In [56]:
# Build a 12x4 DataFrame of random draws from the standard normal
# distribution, using the date range ``dt`` as the row index and labelling the
# columns A, B, C and D.
dt1 = pd.DataFrame(
    data=np.random.randn(12, 4),
    index=dt,
    columns=["A", "B", "C", "D"],
)
dt1

Unnamed: 0,A,B,C,D
2023-01-01,1.101596,-0.990928,1.256951,-1.464047
2023-01-02,0.964449,0.227069,-0.303677,0.218673
2023-01-03,2.026985,-0.046541,2.029069,-0.575669
2023-01-04,-1.853836,-0.849097,0.280463,-0.71924
2023-01-05,0.049935,-1.131711,0.286607,1.432547
2023-01-06,-0.242437,2.224492,-2.664108,-0.513019
2023-01-07,0.696262,0.027865,1.112225,-0.201584
2023-01-08,0.702086,0.131059,-1.229701,0.057626
2023-01-09,-1.376855,1.955929,0.393299,0.189131
2023-01-10,-0.102268,-1.649338,0.486665,-0.001312


In [57]:
# Build a DataFrame from a dict: each key becomes a column label and each
# value supplies that column's data. The scalar values ("A", "B", "F") are
# repeated on every row, and each column keeps its own dtype.
dt2 = pd.DataFrame(
    {
        "A": 2.3,
        "B": pd.Timestamp("20230315"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.array([4] * 4, dtype="int32"),
        "E": pd.Categorical(["uni", "duni", "tuni", "tree"]),
        "F": "low",
    }
)
dt2

Unnamed: 0,A,B,C,D,E,F
0,2.3,2023-03-15,1.0,4,uni,low
1,2.3,2023-03-15,1.0,4,duni,low
2,2.3,2023-03-15,1.0,4,tuni,low
3,2.3,2023-03-15,1.0,4,tree,low


In [58]:
# Show the dtype of each column of dt2 (returned as a Series keyed by column label).
dt2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# VIEWING DATA

#### Para praticar, usei uma base de dados .csv atualizada sobre a COVID-19. A base de dados é pública e pode ser encontrada no link abaixo.

### https://data.humdata.org/dataset/coronavirus-covid-19-cases-and-deaths/resource/2ac6c3c0-76fa-4486-9ad0-9aa9e253b78d

In [59]:
# Load the WHO COVID-19 CSV file located in the current working directory.
data = pd.read_csv(
    "./WHO-COVID-19-global-data.csv",
    low_memory=False,
)

# Preview the first 5 rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
0,2020-01-03,AF,Afghanistan,EMRO,0,0,0,0
1,2020-01-04,AF,Afghanistan,EMRO,0,0,0,0
2,2020-01-05,AF,Afghanistan,EMRO,0,0,0,0
3,2020-01-06,AF,Afghanistan,EMRO,0,0,0,0
4,2020-01-07,AF,Afghanistan,EMRO,0,0,0,0


In [60]:
# Rename the columns: the new camelCase labels are assigned by position, so
# the list must have one entry per existing column, in the same order.
data.columns = [
    "dataReported",
    "countryCode",
    "country",
    "whoRegion",
    "newCases",
    "cumulativeCases",
    "newDeaths",
    "cumulativeDeaths",
]

# Build a Series holding the new column labels.
dataNewsColuns = pd.Series(data.columns)

# Display that Series.
dataNewsColuns

0        dataReported
1         countryCode
2             country
3           whoRegion
4            newCases
5     cumulativeCases
6           newDeaths
7    cumulativeDeaths
dtype: object

In [61]:
# Show the first 10 rows.
data.head(n=10)

Unnamed: 0,dataReported,countryCode,country,whoRegion,newCases,cumulativeCases,newDeaths,cumulativeDeaths
0,2020-01-03,AF,Afghanistan,EMRO,0,0,0,0
1,2020-01-04,AF,Afghanistan,EMRO,0,0,0,0
2,2020-01-05,AF,Afghanistan,EMRO,0,0,0,0
3,2020-01-06,AF,Afghanistan,EMRO,0,0,0,0
4,2020-01-07,AF,Afghanistan,EMRO,0,0,0,0
5,2020-01-08,AF,Afghanistan,EMRO,0,0,0,0
6,2020-01-09,AF,Afghanistan,EMRO,0,0,0,0
7,2020-01-10,AF,Afghanistan,EMRO,0,0,0,0
8,2020-01-11,AF,Afghanistan,EMRO,0,0,0,0
9,2020-01-12,AF,Afghanistan,EMRO,0,0,0,0


In [62]:
# Show the last 10 rows.
data.tail(n=10)

Unnamed: 0,dataReported,countryCode,country,whoRegion,newCases,cumulativeCases,newDeaths,cumulativeDeaths
319940,2023-09-04,ZW,Zimbabwe,AFRO,0,265737,0,5717
319941,2023-09-05,ZW,Zimbabwe,AFRO,0,265737,0,5717
319942,2023-09-06,ZW,Zimbabwe,AFRO,0,265737,0,5717
319943,2023-09-07,ZW,Zimbabwe,AFRO,0,265737,0,5717
319944,2023-09-08,ZW,Zimbabwe,AFRO,0,265737,0,5717
319945,2023-09-09,ZW,Zimbabwe,AFRO,0,265737,0,5717
319946,2023-09-10,ZW,Zimbabwe,AFRO,5,265742,1,5718
319947,2023-09-11,ZW,Zimbabwe,AFRO,0,265742,0,5718
319948,2023-09-12,ZW,Zimbabwe,AFRO,0,265742,0,5718
319949,2023-09-13,ZW,Zimbabwe,AFRO,0,265742,0,5718


In [63]:
# Show the row index of the frame. It is printed explicitly because a bare
# expression that is not the last statement of a notebook cell produces no
# output, so the original ``data.index`` line was silently discarded.
print(data.index)

# Show the dtype of each column.
data.dtypes

dataReported        object
countryCode         object
country             object
whoRegion           object
newCases             int64
cumulativeCases      int64
newDeaths            int64
cumulativeDeaths     int64
dtype: object

In [64]:
# Convert the DataFrame to a NumPy array. The columns mix strings and
# integers, so the resulting array has dtype object.
data.to_numpy()

array([['2020-01-03', 'AF', 'Afghanistan', ..., 0, 0, 0],
       ['2020-01-04', 'AF', 'Afghanistan', ..., 0, 0, 0],
       ['2020-01-05', 'AF', 'Afghanistan', ..., 0, 0, 0],
       ...,
       ['2023-09-11', 'ZW', 'Zimbabwe', ..., 265742, 0, 5718],
       ['2023-09-12', 'ZW', 'Zimbabwe', ..., 265742, 0, 5718],
       ['2023-09-13', 'ZW', 'Zimbabwe', ..., 265742, 0, 5718]],
      dtype=object)

In [65]:
# Quick statistical summary (count, mean, std, min, quartiles, max) of the
# numeric columns.
data.describe()

Unnamed: 0,newCases,cumulativeCases,newDeaths,cumulativeDeaths
count,319950.0,319950.0,319950.0,319950.0
mean,2408.387,1454035.0,21.744698,17934.23
std,39428.11,6652559.0,149.877742,75318.41
min,-65079.0,0.0,-3520.0,0.0
25%,0.0,2318.0,0.0,17.0
50%,1.0,35031.0,0.0,388.0
75%,164.0,406813.0,2.0,5708.0
max,6966046.0,103436800.0,11447.0,1127152.0


In [66]:
# Transpose the frame: the column labels become the row index and the row
# index becomes the column labels.
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,319940,319941,319942,319943,319944,319945,319946,319947,319948,319949
dataReported,2020-01-03,2020-01-04,2020-01-05,2020-01-06,2020-01-07,2020-01-08,2020-01-09,2020-01-10,2020-01-11,2020-01-12,...,2023-09-04,2023-09-05,2023-09-06,2023-09-07,2023-09-08,2023-09-09,2023-09-10,2023-09-11,2023-09-12,2023-09-13
countryCode,AF,AF,AF,AF,AF,AF,AF,AF,AF,AF,...,ZW,ZW,ZW,ZW,ZW,ZW,ZW,ZW,ZW,ZW
country,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,...,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe
whoRegion,EMRO,EMRO,EMRO,EMRO,EMRO,EMRO,EMRO,EMRO,EMRO,EMRO,...,AFRO,AFRO,AFRO,AFRO,AFRO,AFRO,AFRO,AFRO,AFRO,AFRO
newCases,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,0,0
cumulativeCases,0,0,0,0,0,0,0,0,0,0,...,265737,265737,265737,265737,265737,265737,265742,265742,265742,265742
newDeaths,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
cumulativeDeaths,0,0,0,0,0,0,0,0,0,0,...,5717,5717,5717,5717,5717,5717,5718,5718,5718,5718


In [68]:
# Sort by labels along an axis (axis=0 sorts by the row index, axis=1 would
# sort by the column labels), here in descending order, then show the first
# 10 rows of the result.
data.sort_index(axis=0, ascending=False).head(n=10)

Unnamed: 0,dataReported,countryCode,country,whoRegion,newCases,cumulativeCases,newDeaths,cumulativeDeaths
319949,2023-09-13,ZW,Zimbabwe,AFRO,0,265742,0,5718
319948,2023-09-12,ZW,Zimbabwe,AFRO,0,265742,0,5718
319947,2023-09-11,ZW,Zimbabwe,AFRO,0,265742,0,5718
319946,2023-09-10,ZW,Zimbabwe,AFRO,5,265742,1,5718
319945,2023-09-09,ZW,Zimbabwe,AFRO,0,265737,0,5717
319944,2023-09-08,ZW,Zimbabwe,AFRO,0,265737,0,5717
319943,2023-09-07,ZW,Zimbabwe,AFRO,0,265737,0,5717
319942,2023-09-06,ZW,Zimbabwe,AFRO,0,265737,0,5717
319941,2023-09-05,ZW,Zimbabwe,AFRO,0,265737,0,5717
319940,2023-09-04,ZW,Zimbabwe,AFRO,0,265737,0,5717


In [81]:

# Select three columns, sort the rows by index label in ascending order and
# show the first 10 rows.
(
    data[["dataReported", "country", "newCases"]]
    .sort_index(axis=0, ascending=True)
    .head(n=10)
)

Unnamed: 0,dataReported,country,newCases
0,2020-01-03,Afghanistan,0
1,2020-01-04,Afghanistan,0
2,2020-01-05,Afghanistan,0
3,2020-01-06,Afghanistan,0
4,2020-01-07,Afghanistan,0
5,2020-01-08,Afghanistan,0
6,2020-01-09,Afghanistan,0
7,2020-01-10,Afghanistan,0
8,2020-01-11,Afghanistan,0
9,2020-01-12,Afghanistan,0


In [86]:
# Sort by the values of one column (here "country", in descending order) and
# show the first 10 rows of the two selected columns.
(
    data[["country", "whoRegion"]]
    .sort_values(by="country", ascending=False)
    .head(n=10)
)

Unnamed: 0,country,whoRegion
211948,"occupied Palestinian territory, including east...",EMRO
211052,"occupied Palestinian territory, including east...",EMRO
211044,"occupied Palestinian territory, including east...",EMRO
211045,"occupied Palestinian territory, including east...",EMRO
211046,"occupied Palestinian territory, including east...",EMRO
211047,"occupied Palestinian territory, including east...",EMRO
211048,"occupied Palestinian territory, including east...",EMRO
211049,"occupied Palestinian territory, including east...",EMRO
211050,"occupied Palestinian territory, including east...",EMRO
211051,"occupied Palestinian territory, including east...",EMRO


# SELECTION

In [88]:
# Select two columns by label and show their first 10 rows.
data[["country", "cumulativeCases"]].head(n=10)

Unnamed: 0,country,cumulativeCases
0,Afghanistan,0
1,Afghanistan,0
2,Afghanistan,0
3,Afghanistan,0
4,Afghanistan,0
5,Afghanistan,0
6,Afghanistan,0
7,Afghanistan,0
8,Afghanistan,0
9,Afghanistan,0


In [90]:
# Select two columns, then slice rows 0 through 5 (the slice end, 6, is
# exclusive).
data[["dataReported", "newCases"]][0:6]

Unnamed: 0,dataReported,newCases
0,2020-01-03,0
1,2020-01-04,0
2,2020-01-05,0
3,2020-01-06,0
4,2020-01-07,0
5,2020-01-08,0


In [92]:
# Slice rows 100 through 104 of the frame (the slice end, 105, is exclusive).
data[100:105]

Unnamed: 0,dataReported,countryCode,country,whoRegion,newCases,cumulativeCases,newDeaths,cumulativeDeaths
100,2020-04-12,AF,Afghanistan,EMRO,37,521,0,15
101,2020-04-13,AF,Afghanistan,EMRO,34,555,3,18
102,2020-04-14,AF,Afghanistan,EMRO,52,607,1,19
103,2020-04-15,AF,Afghanistan,EMRO,58,665,3,22
104,2020-04-16,AF,Afghanistan,EMRO,56,721,3,25


In [94]:
# Select the row whose index label is 100; a single label returns the row as
# a Series keyed by column name.
data.loc[100]

dataReported         2020-04-12
countryCode                  AF
country             Afghanistan
whoRegion                  EMRO
newCases                     37
cumulativeCases             521
newDeaths                     0
cumulativeDeaths             15
Name: 100, dtype: object

In [95]:
# Passing a list of labels (double brackets) returns a DataFrame instead of a
# Series, even for a single row.
data.loc[[1320]]

Unnamed: 0,dataReported,countryCode,country,whoRegion,newCases,cumulativeCases,newDeaths,cumulativeDeaths
1320,2023-08-15,AF,Afghanistan,EMRO,61,224745,1,7942


In [100]:
# Label-based slice: rows with index labels 10 through 20 (with .loc both ends
# of the slice are included), restricted to the listed columns.
data.loc[10:20,["dataReported","newCases"]]

Unnamed: 0,dataReported,newCases
10,2020-01-13,0
11,2020-01-14,0
12,2020-01-15,0
13,2020-01-16,0
14,2020-01-17,0
15,2020-01-18,0
16,2020-01-19,0
17,2020-01-20,0
18,2020-01-21,0
19,2020-01-22,0


In [103]:
# Position-based slicing: the first slice selects rows, the second selects
# columns. With .iloc the slice end is exclusive, so this returns the rows at
# positions 0-19 and the columns at positions 0-4.
data.iloc[0:20,0:5]

Unnamed: 0,dataReported,countryCode,country,whoRegion,newCases
0,2020-01-03,AF,Afghanistan,EMRO,0
1,2020-01-04,AF,Afghanistan,EMRO,0
2,2020-01-05,AF,Afghanistan,EMRO,0
3,2020-01-06,AF,Afghanistan,EMRO,0
4,2020-01-07,AF,Afghanistan,EMRO,0
5,2020-01-08,AF,Afghanistan,EMRO,0
6,2020-01-09,AF,Afghanistan,EMRO,0
7,2020-01-10,AF,Afghanistan,EMRO,0
8,2020-01-11,AF,Afghanistan,EMRO,0
9,2020-01-12,AF,Afghanistan,EMRO,0


# INDEXAÇÃO BOOLEANA

In [121]:
# Boolean indexing: keep the rows whose newCases value equals 150, restricted
# to the dataReported and country columns, and show the first 10 matches.
# A single .loc call applies the row mask and the column selection together,
# instead of chaining two separate ``[...]`` lookups.
data.loc[data["newCases"] == 150, ["dataReported", "country"]].head(10)

Unnamed: 0,dataReported,country
868,2022-05-20,Afghanistan
1614,2020-09-23,Albania
2806,2020-04-18,Algeria
7251,2021-05-18,Angola
7273,2021-06-09,Angola
7345,2021-08-20,Angola
7354,2021-08-29,Angola
7511,2022-02-02,Angola
12406,2020-09-15,Armenia
12704,2021-07-10,Armenia


In [None]:
# continuar..